1 Notes

This report was generated on 2022-10-04 10:03:18. R version: 4.2.1 on aarch64-apple-darwin20. For this report, CRAN packages as of 2022-01-10 were used.

1.1 R-Script & data

The preprocessing and analysis of the data were conducted in the R project for statistical computing. The RMarkdown script used to generate this document and all the resulting data can be downloaded via this link. By executing main.Rmd, the process described herein can be reproduced and this document can be generated. In the course of this, data from the folder input will be processed and results will be written to output. The html online version of the analysis can be accessed via this link.

1.2 GitHub

The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/lsr.

1.3 License

1.4 Data description of output files

1.4.0.1 abc.csv (Example)

Attribute Type Description
a Numeric
b Numeric
c Numeric

1.4.0.2 xyz.csv

2 Set up

## [1] "package package:rmarkdown detached"

2.1 Define packages

# from https://mran.revolutionanalytics.com/web/packages/\
# checkpoint/vignettes/using-checkpoint-with-knitr.html
# if you don't need a package, remove it from here (commenting not sufficient)
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
# Write the list of required packages to manifest.R; checkpoint (below)
# scans this file to decide which packages to install for the snapshot date,
# and it is sourced in section 2.3 to actually attach the packages.
cat("
library(rstudioapi)
library(tidyverse)
library(scales)
library(jsonlite)
library(lintr)
library(rmarkdown)
library(data.table)
library(cowplot)
library(extrafont)
library(waldo)
library(psych)
library(ggrepel)
library(skimr)
library(lsr)
library(vcd)
library(vcdExtra)
library(sciplot)
library(gplots)
library(car)
library(lmtest)
library(effects)
library(BayesFactor)
library(itns)
library(janitor)",
file = "manifest.R"
)

2.2 Install packages

# if checkpoint is not yet installed, install it (for people using this
# system for the first time); require() is used deliberately here because
# its logical return value drives the install-on-demand logic
if (!require(checkpoint)) {
  if (!require(devtools)) {
    install.packages("devtools", repos = "http://cran.us.r-project.org")
    require(devtools)
  }
  devtools::install_github("RevolutionAnalytics/checkpoint",
    ref = "v0.3.2", # pinned release; could be adapted later
    # (as of the beginning of July 2017 this was the current CRAN release)
    repos = "http://cran.us.r-project.org"
  )
  require(checkpoint)
}
# checkpoint needs its snapshot cache directory to exist
# nolint start
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
# NOTE(review): package_date, path_to_wd and r_version are assumed to be
# defined earlier in the document (not visible in this chunk)
checkpoint(
  snapshot_date = package_date,
  project = path_to_wd,
  verbose = TRUE, # was T; TRUE/FALSE cannot be reassigned, T/F can
  scanForPackages = TRUE,
  use.knitr = FALSE,
  R.version = r_version
)
rm(package_date)

2.3 Load packages

source("manifest.R")
unlink("manifest.R")
sessionInfo()
## R version 4.2.1 Patched (2022-07-22 r82618)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] C/UTF-8/C/C/C/C
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] janitor_2.1.0          itns_0.1.0             BayesFactor_0.9.12-4.3
##  [4] Matrix_1.5-1           coda_0.19-4            effects_4.2-1         
##  [7] lmtest_0.9-39          zoo_1.8-9              car_3.0-12            
## [10] carData_3.0-4          gplots_3.1.1           sciplot_1.2-0         
## [13] vcdExtra_0.7-5         gnm_1.1-1              vcd_1.4-9             
## [16] lsr_0.5.2              skimr_2.1.3            ggrepel_0.9.1         
## [19] psych_2.1.9            waldo_0.3.1            extrafont_0.17        
## [22] cowplot_1.1.1          data.table_1.14.2      rmarkdown_2.11        
## [25] lintr_2.0.1            jsonlite_1.7.2         scales_1.1.1          
## [28] forcats_0.5.1          stringr_1.4.0          dplyr_1.0.7           
## [31] purrr_0.3.4            readr_2.1.0            tidyr_1.1.4           
## [34] tibble_3.1.6           ggplot2_3.3.5          tidyverse_1.3.1       
## [37] checkpoint_1.0.2       rstudioapi_0.13        knitr_1.37            
## 
## loaded via a namespace (and not attached):
##  [1] minqa_1.2.4        colorspace_2.0-2   ellipsis_0.3.2     rprojroot_2.0.2   
##  [5] snakecase_0.11.0   base64enc_0.1-3    fs_1.5.0           MatrixModels_0.5-0
##  [9] remotes_2.4.2      mvtnorm_1.1-3      fansi_0.5.0        lubridate_1.8.0   
## [13] xml2_1.3.2         splines_4.2.1      qvcalc_1.0.2       mnormt_2.0.2      
## [17] cachem_1.0.6       nloptr_1.2.2.3     broom_0.7.10       Rttf2pt1_1.3.9    
## [21] dbplyr_2.1.1       compiler_4.2.1     httr_1.4.2         backports_1.4.0   
## [25] assertthat_0.2.1   fastmap_1.1.0      lazyeval_0.2.2     survey_4.1-1      
## [29] cli_3.1.0          htmltools_0.5.2    tools_4.2.1        gtable_0.3.0      
## [33] glue_1.6.0         Rcpp_1.0.7         cellranger_1.1.0   jquerylib_0.1.4   
## [37] vctrs_0.3.8        nlme_3.1-159       extrafontdb_1.0    insight_0.14.5    
## [41] xfun_0.28          ps_1.6.0           lme4_1.1-27.1      rvest_1.0.2       
## [45] lifecycle_1.0.1    gtools_3.9.2       ca_0.71.1          MASS_7.3-58.1     
## [49] hms_1.1.1          rex_1.2.0          parallel_4.2.1     yaml_2.2.1        
## [53] pbapply_1.5-0      sass_0.4.2         stringi_1.7.5      desc_1.4.0        
## [57] caTools_1.18.2     boot_1.3-28        cyclocomp_1.1.0    repr_1.1.3        
## [61] rlang_0.4.12       pkgconfig_2.0.3    bitops_1.0-7       evaluate_0.14     
## [65] lattice_0.20-45    processx_3.5.2     tidyselect_1.1.1   magrittr_2.0.1    
## [69] R6_2.5.1           generics_0.1.1     DBI_1.1.1          pillar_1.6.4      
## [73] haven_2.4.3        withr_2.4.2        survival_3.4-0     abind_1.4-5       
## [77] nnet_7.3-18        modelr_0.1.8       crayon_1.4.2       relimp_1.0-5      
## [81] KernSmooth_2.23-20 utf8_1.2.2         tmvnsim_1.0-2      tzdb_0.2.0        
## [85] readxl_1.3.1       callr_3.7.0        reprex_2.0.1       digest_0.6.29     
## [89] munsell_0.5.0      bslib_0.4.0        mitools_2.4

2.4 Load additional scripts

# if you want to outsource logic to other script files, see README for
# further information
# Register each visualization helper as a knitr chunk and source it,
# in the same order as before
helper_scripts <- c(
  "scripts/dviz.supp.R",
  "scripts/themes.R",
  "scripts/plot_grid.R",
  "scripts/align_legend.R",
  "scripts/label_log10.R"
)
for (script in helper_scripts) {
  knitr::read_chunk(script)
  source(script)
}

3 Programming

3.1 Loops

# the while loop, written with repeat/break: keep adding 17 until the
# running total reaches at least 1000 (first multiple of 17 >= 1000)
x <- 0
repeat {
  if (x >= 1000) break
  x <- x + 17
}
x
## [1] 1003
# the for loop: repeat a fixed action three times
for (iteration in seq_len(3)) {
  print("hello")
}
## [1] "hello"
## [1] "hello"
## [1] "hello"
# a simple example: report each word's length, shown in upper case
words <- c("it", "was", "the", "dirty", "end", "of", "winter")
for (w in words) {
  msg <- paste(toupper(w), "has", nchar(w), "letters")
  print(msg)
}
## [1] "IT has 2 letters"
## [1] "WAS has 3 letters"
## [1] "THE has 3 letters"
## [1] "DIRTY has 5 letters"
## [1] "END has 3 letters"
## [1] "OF has 2 letters"
## [1] "WINTER has 6 letters"
# a more complex example: amortize a mortgage month by month
# set up
month <- 0 # count the number of months
balance <- 300000 # initial mortgage balance
payments <- 1600 # monthly payments
interest <- 0.05 # 5% interest rate per year
total.paid <- 0 # track what you've paid the bank

# convert annual interest to a monthly multiplier
# (geometric compounding: the 12th root of 1.05, not 1 + 0.05 / 12)
monthly.multiplier <- (1 + interest)^(1 / 12)

# keep looping until the loan is paid off...
# note the order inside the loop: interest accrues BEFORE the payment
# is subtracted, so the final balance can go slightly negative
while (balance > 0) {

  # do the calculations for this month
  month <- month + 1 # one more month
  balance <- balance * monthly.multiplier # add the interest
  balance <- balance - payments # make the payments
  total.paid <- total.paid + payments # track the total paid

  # print the results on screen
  cat("month", month, ": balance", round(balance), "\n")
} # end of loop
## month 1 : balance 299622 
## month 2 : balance 299243 
## month 3 : balance 298862 
## month 4 : balance 298480 
## month 5 : balance 298096 
## month 6 : balance 297710 
## month 7 : balance 297323 
## month 8 : balance 296934 
## month 9 : balance 296544 
## month 10 : balance 296152 
## month 11 : balance 295759 
## month 12 : balance 295364 
## month 13 : balance 294967 
## month 14 : balance 294569 
## month 15 : balance 294169 
## month 16 : balance 293768 
## month 17 : balance 293364 
## month 18 : balance 292960 
## month 19 : balance 292553 
## month 20 : balance 292145 
## month 21 : balance 291735 
## month 22 : balance 291324 
## month 23 : balance 290911 
## month 24 : balance 290496 
## month 25 : balance 290079 
## month 26 : balance 289661 
## month 27 : balance 289241 
## month 28 : balance 288820 
## month 29 : balance 288396 
## month 30 : balance 287971 
## month 31 : balance 287545 
## month 32 : balance 287116 
## month 33 : balance 286686 
## month 34 : balance 286254 
## month 35 : balance 285820 
## month 36 : balance 285385 
## month 37 : balance 284947 
## month 38 : balance 284508 
## month 39 : balance 284067 
## month 40 : balance 283625 
## month 41 : balance 283180 
## month 42 : balance 282734 
## month 43 : balance 282286 
## month 44 : balance 281836 
## month 45 : balance 281384 
## month 46 : balance 280930 
## month 47 : balance 280475 
## month 48 : balance 280018 
## month 49 : balance 279559 
## month 50 : balance 279098 
## month 51 : balance 278635 
## month 52 : balance 278170 
## month 53 : balance 277703 
## month 54 : balance 277234 
## month 55 : balance 276764 
## month 56 : balance 276292 
## month 57 : balance 275817 
## month 58 : balance 275341 
## month 59 : balance 274863 
## month 60 : balance 274382 
## month 61 : balance 273900 
## month 62 : balance 273416 
## month 63 : balance 272930 
## month 64 : balance 272442 
## month 65 : balance 271952 
## month 66 : balance 271460 
## month 67 : balance 270966 
## month 68 : balance 270470 
## month 69 : balance 269972 
## month 70 : balance 269472 
## month 71 : balance 268970 
## month 72 : balance 268465 
## month 73 : balance 267959 
## month 74 : balance 267451 
## month 75 : balance 266941 
## month 76 : balance 266428 
## month 77 : balance 265914 
## month 78 : balance 265397 
## month 79 : balance 264878 
## month 80 : balance 264357 
## month 81 : balance 263834 
## month 82 : balance 263309 
## month 83 : balance 262782 
## month 84 : balance 262253 
## month 85 : balance 261721 
## month 86 : balance 261187 
## month 87 : balance 260651 
## month 88 : balance 260113 
## month 89 : balance 259573 
## month 90 : balance 259031 
## month 91 : balance 258486 
## month 92 : balance 257939 
## month 93 : balance 257390 
## month 94 : balance 256839 
## month 95 : balance 256285 
## month 96 : balance 255729 
## month 97 : balance 255171 
## month 98 : balance 254611 
## month 99 : balance 254048 
## month 100 : balance 253483 
## month 101 : balance 252916 
## month 102 : balance 252346 
## month 103 : balance 251774 
## month 104 : balance 251200 
## month 105 : balance 250623 
## month 106 : balance 250044 
## month 107 : balance 249463 
## month 108 : balance 248879 
## month 109 : balance 248293 
## month 110 : balance 247705 
## month 111 : balance 247114 
## month 112 : balance 246521 
## month 113 : balance 245925 
## month 114 : balance 245327 
## month 115 : balance 244727 
## month 116 : balance 244124 
## month 117 : balance 243518 
## month 118 : balance 242911 
## month 119 : balance 242300 
## month 120 : balance 241687 
## month 121 : balance 241072 
## month 122 : balance 240454 
## month 123 : balance 239834 
## month 124 : balance 239211 
## month 125 : balance 238585 
## month 126 : balance 237958 
## month 127 : balance 237327 
## month 128 : balance 236694 
## month 129 : balance 236058 
## month 130 : balance 235420 
## month 131 : balance 234779 
## month 132 : balance 234136 
## month 133 : balance 233489 
## month 134 : balance 232841 
## month 135 : balance 232189 
## month 136 : balance 231535 
## month 137 : balance 230879 
## month 138 : balance 230219 
## month 139 : balance 229557 
## month 140 : balance 228892 
## month 141 : balance 228225 
## month 142 : balance 227555 
## month 143 : balance 226882 
## month 144 : balance 226206 
## month 145 : balance 225528 
## month 146 : balance 224847 
## month 147 : balance 224163 
## month 148 : balance 223476 
## month 149 : balance 222786 
## month 150 : balance 222094 
## month 151 : balance 221399 
## month 152 : balance 220701 
## month 153 : balance 220000 
## month 154 : balance 219296 
## month 155 : balance 218590 
## month 156 : balance 217880 
## month 157 : balance 217168 
## month 158 : balance 216453 
## month 159 : balance 215735 
## month 160 : balance 215014 
## month 161 : balance 214290 
## month 162 : balance 213563 
## month 163 : balance 212833 
## month 164 : balance 212100 
## month 165 : balance 211364 
## month 166 : balance 210625 
## month 167 : balance 209883 
## month 168 : balance 209138 
## month 169 : balance 208390 
## month 170 : balance 207639 
## month 171 : balance 206885 
## month 172 : balance 206128 
## month 173 : balance 205368 
## month 174 : balance 204605 
## month 175 : balance 203838 
## month 176 : balance 203069 
## month 177 : balance 202296 
## month 178 : balance 201520 
## month 179 : balance 200741 
## month 180 : balance 199959 
## month 181 : balance 199174 
## month 182 : balance 198385 
## month 183 : balance 197593 
## month 184 : balance 196798 
## month 185 : balance 196000 
## month 186 : balance 195199 
## month 187 : balance 194394 
## month 188 : balance 193586 
## month 189 : balance 192775 
## month 190 : balance 191960 
## month 191 : balance 191142 
## month 192 : balance 190321 
## month 193 : balance 189496 
## month 194 : balance 188668 
## month 195 : balance 187837 
## month 196 : balance 187002 
## month 197 : balance 186164 
## month 198 : balance 185323 
## month 199 : balance 184478 
## month 200 : balance 183629 
## month 201 : balance 182777 
## month 202 : balance 181922 
## month 203 : balance 181063 
## month 204 : balance 180201 
## month 205 : balance 179335 
## month 206 : balance 178466 
## month 207 : balance 177593 
## month 208 : balance 176716 
## month 209 : balance 175836 
## month 210 : balance 174953 
## month 211 : balance 174065 
## month 212 : balance 173175 
## month 213 : balance 172280 
## month 214 : balance 171382 
## month 215 : balance 170480 
## month 216 : balance 169575 
## month 217 : balance 168666 
## month 218 : balance 167753 
## month 219 : balance 166836 
## month 220 : balance 165916 
## month 221 : balance 164992 
## month 222 : balance 164064 
## month 223 : balance 163133 
## month 224 : balance 162197 
## month 225 : balance 161258 
## month 226 : balance 160315 
## month 227 : balance 159368 
## month 228 : balance 158417 
## month 229 : balance 157463 
## month 230 : balance 156504 
## month 231 : balance 155542 
## month 232 : balance 154576 
## month 233 : balance 153605 
## month 234 : balance 152631 
## month 235 : balance 151653 
## month 236 : balance 150671 
## month 237 : balance 149685 
## month 238 : balance 148695 
## month 239 : balance 147700 
## month 240 : balance 146702 
## month 241 : balance 145700 
## month 242 : balance 144693 
## month 243 : balance 143683 
## month 244 : balance 142668 
## month 245 : balance 141650 
## month 246 : balance 140627 
## month 247 : balance 139600 
## month 248 : balance 138568 
## month 249 : balance 137533 
## month 250 : balance 136493 
## month 251 : balance 135449 
## month 252 : balance 134401 
## month 253 : balance 133349 
## month 254 : balance 132292 
## month 255 : balance 131231 
## month 256 : balance 130166 
## month 257 : balance 129096 
## month 258 : balance 128022 
## month 259 : balance 126943 
## month 260 : balance 125861 
## month 261 : balance 124773 
## month 262 : balance 123682 
## month 263 : balance 122586 
## month 264 : balance 121485 
## month 265 : balance 120380 
## month 266 : balance 119270 
## month 267 : balance 118156 
## month 268 : balance 117038 
## month 269 : balance 115915 
## month 270 : balance 114787 
## month 271 : balance 113654 
## month 272 : balance 112518 
## month 273 : balance 111376 
## month 274 : balance 110230 
## month 275 : balance 109079 
## month 276 : balance 107923 
## month 277 : balance 106763 
## month 278 : balance 105598 
## month 279 : balance 104428 
## month 280 : balance 103254 
## month 281 : balance 102074 
## month 282 : balance 100890 
## month 283 : balance 99701 
## month 284 : balance 98507 
## month 285 : balance 97309 
## month 286 : balance 96105 
## month 287 : balance 94897 
## month 288 : balance 93683 
## month 289 : balance 92465 
## month 290 : balance 91242 
## month 291 : balance 90013 
## month 292 : balance 88780 
## month 293 : balance 87542 
## month 294 : balance 86298 
## month 295 : balance 85050 
## month 296 : balance 83797 
## month 297 : balance 82538 
## month 298 : balance 81274 
## month 299 : balance 80005 
## month 300 : balance 78731 
## month 301 : balance 77452 
## month 302 : balance 76168 
## month 303 : balance 74878 
## month 304 : balance 73583 
## month 305 : balance 72283 
## month 306 : balance 70977 
## month 307 : balance 69666 
## month 308 : balance 68350 
## month 309 : balance 67029 
## month 310 : balance 65702 
## month 311 : balance 64369 
## month 312 : balance 63032 
## month 313 : balance 61688 
## month 314 : balance 60340 
## month 315 : balance 58986 
## month 316 : balance 57626 
## month 317 : balance 56261 
## month 318 : balance 54890 
## month 319 : balance 53514 
## month 320 : balance 52132 
## month 321 : balance 50744 
## month 322 : balance 49351 
## month 323 : balance 47952 
## month 324 : balance 46547 
## month 325 : balance 45137 
## month 326 : balance 43721 
## month 327 : balance 42299 
## month 328 : balance 40871 
## month 329 : balance 39438 
## month 330 : balance 37998 
## month 331 : balance 36553 
## month 332 : balance 35102 
## month 333 : balance 33645 
## month 334 : balance 32182 
## month 335 : balance 30713 
## month 336 : balance 29238 
## month 337 : balance 27758 
## month 338 : balance 26271 
## month 339 : balance 24778 
## month 340 : balance 23279 
## month 341 : balance 21773 
## month 342 : balance 20262 
## month 343 : balance 18745 
## month 344 : balance 17221 
## month 345 : balance 15691 
## month 346 : balance 14155 
## month 347 : balance 12613 
## month 348 : balance 11064 
## month 349 : balance 9509 
## month 350 : balance 7948 
## month 351 : balance 6380 
## month 352 : balance 4806 
## month 353 : balance 3226 
## month 354 : balance 1639 
## month 355 : balance 46 
## month 356 : balance -1554
# print the total payments at the end (number of months x monthly payment,
# accumulated in total.paid by the loop above)
cat("total payments made", total.paid, "\n")
## total payments made 569600

3.2 Conditional statements

# find out what day it is by asking the system clock
today <- Sys.Date()
day <- weekdays(today)

# branch on the weekday, handling the common (non-Monday) case first
if (day != "Monday") {
  print("I'm a happy little automaton")
} else {
  print("I don't like Mondays")
}
## [1] "I'm a happy little automaton"

3.3 Functions

A formal argument can be a symbol (i.e. a variable name such as x or y), a statement of the form symbol = expression (e.g. pch = 16), or the special formal argument … (triple dot).

# a simple function: multiply its input by four (vectorized, like `*`)
quadruple <- function(x) x * 4
quadruple(3)
## [1] 12
# another simple function with two arguments; y defaults to 1,
# so pow(x) simply returns x unchanged
pow <- function(x, y = 1) {
  x^y # raise x to the power y; last expression is the return value
}
pow(3, 2)
## [1] 9
# using the ... argument: forward every argument to max(), then double it
doubleMax <- function(...) {
  2 * max(...)
}
doubleMax(4, 6, 9)
## [1] 18

3.4 Implicit loops

# sapply() applies a function to every element, but its return type depends
# on the input; vapply() does the same job while guaranteeing the result
# type (here: one integer per word), which is safer in non-interactive code.
# The printed result (a named integer vector) is identical.
words <- c("along", "the", "loom", "of", "the", "land")
vapply(words, FUN = nchar, FUN.VALUE = integer(1))
## along   the  loom    of   the  land 
##     5     3     4     2     3     4
# tapply(): apply FUN to subsets of a vector, grouped by INDEX
gender <- c("male", "male", "female", "female", "male")
age <- c(10, 12, 9, 11, 13)
# loop over all values that appear in the INDEX grouping variable
# (here: the mean age for each gender)
tapply(age, INDEX = gender, FUN = mean)
##   female     male 
## 10.00000 11.66667
# by(): same grouped computation as tapply(), different printed layout
by(age, INDICES = gender, FUN = mean) # does the same as tapply
## gender: female
## [1] 10
## ------------------------------------------------------------ 
## gender: male
## [1] 11.66667

4 Descriptive statistics

4.1 Measures of central tendency

# Load data — provides afl.finalists (used here) and afl.margins
# (used in the sections below)
load("input/aflsmall.Rdata")
# frequency table: number of finals appearances per team
table(afl.finalists)
## afl.finalists
##         Adelaide         Brisbane          Carlton      Collingwood 
##               26               25               26               28 
##         Essendon          Fitzroy        Fremantle          Geelong 
##               32                0                6               39 
##         Hawthorn        Melbourne  North Melbourne    Port Adelaide 
##               27               28               28               17 
##         Richmond         St Kilda           Sydney       West Coast 
##                6               24               26               38 
## Western Bulldogs 
##               24
# calculate the mode (modeOf() is provided by the lsr package)
m <- modeOf(afl.finalists)
cat("The mode of the dataset is", m, "\n")
## The mode of the dataset is Geelong
# calculate the modal frequency
# NOTE(review): `max` shadows base::max() from here on — consider renaming
max <- maxFreq(afl.finalists)
cat("The modal frequency of the dataset is", max, "\n")
## The modal frequency of the dataset is 39

4.2 Measures of variability

# calculate quantiles (probs = .5 is the median)
quantile(afl.margins, probs = .5)
##  50% 
## 30.5
# the 25th and 75th percentiles in one call
quantile(afl.margins, probs = c(.25, .75))
##   25%   75% 
## 12.75 50.50
# calculate IQR (distance between the 25th and 75th percentiles)
i <- IQR(afl.margins)
cat("IQR is", i, "\n")
## IQR is 37.75
# calculate the mean absolute deviation (AAD) of the first five margins
a <- lsr::aad(afl.margins[1:5])
cat("The mean absolute deviation (AAD) is", a, "\n")
## The mean absolute deviation (AAD) is 15.52
# calculate variance (also called mean square deviation)
v <- var(afl.margins[1:5])
cat("The variance is", v, "\n")
## The variance is 405.8
# calculate standard deviation (also called root mean squared deviation)
s <- sd(afl.margins)
cat("The standard deviation is", s, "\n")
## The standard deviation is 26.07364
# calculate median absolute deviation (MAD); constant = 1 gives the raw
# MAD rather than base R's consistency-adjusted default (1.4826)
m <- mad(afl.margins, constant = 1)
cat("The median absolute deviation (MAD) is", m, "\n")
## The median absolute deviation (MAD) is 19.5
# calculate skewness (psych package)
sk <- psych::skew(afl.margins)
cat("The skewness is", sk, "\n")
## The skewness is 0.7671555
# calculate (excess) kurtosis (psych package)
k <- psych::kurtosi(afl.margins)
cat("The kurtosis is", k, "\n")

4.3 Overall data summary

# load data — provides the clin.trial data frame used below
load("input/clinicaltrial.Rdata")

# for numeric variables: five-number summary plus the mean
summary(afl.margins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.75   30.50   35.30   50.50  116.00
# for nominal (factor) variables: counts per factor level
summary(afl.finalists)
##         Adelaide         Brisbane          Carlton      Collingwood 
##               26               25               26               28 
##         Essendon          Fitzroy        Fremantle          Geelong 
##               32                0                6               39 
##         Hawthorn        Melbourne  North Melbourne    Port Adelaide 
##               27               28               28               17 
##         Richmond         St Kilda           Sydney       West Coast 
##                6               24               26               38 
## Western Bulldogs 
##               24
# for data frames: one row of descriptive statistics per variable;
# factor columns are flagged with * and treated as numeric codes
psych::describe(x = clin.trial)
##           vars  n mean   sd median trimmed  mad min max range skew kurtosis
## drug*        1 18 2.00 0.84   2.00    2.00 1.48 1.0 3.0   2.0 0.00    -1.66
## therapy*     2 18 1.50 0.51   1.50    1.50 0.74 1.0 2.0   1.0 0.00    -2.11
## mood.gain    3 18 0.88 0.53   0.85    0.88 0.67 0.1 1.8   1.7 0.13    -1.44
##             se
## drug*     0.20
## therapy*  0.12
## mood.gain 0.13
# broken down by grouping variable: the describe() table, repeated once
# for each level of therapy
psych::describeBy(clin.trial, group = clin.trial$therapy)
## 
##  Descriptive statistics by group 
## group: no.therapy
##           vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
## therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
## mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
## ------------------------------------------------------------ 
## group: CBT
##           vars n mean   sd median trimmed  mad min max range  skew kurtosis
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
## therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
## mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
##             se
## drug*     0.29
## therapy*  0.00
## mood.gain 0.15
by(clin.trial, INDICES = clin.trial$therapy, FUN = describe)
## clin.trial$therapy: no.therapy
##           vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0 0.00    -1.81 0.29
## therapy*     2 9 1.00 0.00    1.0    1.00 0.00 1.0 1.0   0.0  NaN      NaN 0.00
## mood.gain    3 9 0.72 0.59    0.5    0.72 0.44 0.1 1.7   1.6 0.51    -1.59 0.20
## ------------------------------------------------------------ 
## clin.trial$therapy: CBT
##           vars n mean   sd median trimmed  mad min max range  skew kurtosis
## drug*        1 9 2.00 0.87    2.0    2.00 1.48 1.0 3.0   2.0  0.00    -1.81
## therapy*     2 9 2.00 0.00    2.0    2.00 0.00 2.0 2.0   0.0   NaN      NaN
## mood.gain    3 9 1.04 0.45    1.1    1.04 0.44 0.3 1.8   1.5 -0.03    -1.12
##             se
## drug*     0.29
## therapy*  0.00
## mood.gain 0.15
by(clin.trial, INDICES = clin.trial$therapy, FUN = summary)
## clin.trial$therapy: no.therapy
##        drug         therapy    mood.gain     
##  placebo :3   no.therapy:9   Min.   :0.1000  
##  anxifree:3   CBT       :0   1st Qu.:0.3000  
##  joyzepam:3                  Median :0.5000  
##                              Mean   :0.7222  
##                              3rd Qu.:1.3000  
##                              Max.   :1.7000  
## ------------------------------------------------------------ 
## clin.trial$therapy: CBT
##        drug         therapy    mood.gain    
##  placebo :3   no.therapy:0   Min.   :0.300  
##  anxifree:3   CBT       :9   1st Qu.:0.800  
##  joyzepam:3                  Median :1.100  
##                              Mean   :1.044  
##                              3rd Qu.:1.300  
##                              Max.   :1.800
aggregate(mood.gain ~ drug + therapy, data = clin.trial, FUN = mean)
##       drug    therapy mood.gain
## 1  placebo no.therapy  0.300000
## 2 anxifree no.therapy  0.400000
## 3 joyzepam no.therapy  1.466667
## 4  placebo        CBT  0.600000
## 5 anxifree        CBT  1.033333
## 6 joyzepam        CBT  1.500000

4.4 Correlations

# load data — provides the parenthood data frame
load("input/parenthood.Rdata")

# eyeball the marginal distributions first...
hist(parenthood$dan.grump)

hist(parenthood$dan.sleep)

hist(parenthood$baby.sleep)

# ...then the pairwise relationships
plot(x = parenthood$dan.sleep, y = parenthood$dan.grump)

plot(x = parenthood$baby.sleep, y = parenthood$dan.grump)

# calculate correlations (the Pearson's correlation)
cor(x = parenthood$dan.sleep, y = parenthood$dan.grump)
## [1] -0.903384
# cor() on a whole data frame gives the full correlation matrix
cor(parenthood)
##              dan.sleep  baby.sleep   dan.grump         day
## dan.sleep   1.00000000  0.62794934 -0.90338404 -0.09840768
## baby.sleep  0.62794934  1.00000000 -0.56596373 -0.01043394
## dan.grump  -0.90338404 -0.56596373  1.00000000  0.07647926
## day        -0.09840768 -0.01043394  0.07647926  1.00000000
# load data — provides the effort data frame (hours, grade)
load("input/effort.Rdata")

# calculate correlations (the Spearman's correlation):
# Spearman = Pearson correlation computed on the ranks
hours_rank <- rank(effort$hours)
grade_rank <- rank(effort$grade)
cor(hours_rank, grade_rank)
## [1] 1
# or, equivalently, let cor() do the ranking internally
cor(effort$hours, effort$grade, method = "spearman")
## [1] 1
# calculate correlations, skipping all non-numeric variables
# load data — provides the work data frame
load("input/work.Rdata")

# lsr::correlate() only fills in cells where both variables are numeric
lsr::correlate(work)
## 
## CORRELATIONS
## ============
## - correlation type:  pearson 
## - correlations shown only when both variables are numeric
## 
##           hours  tasks   pay    day weekday   week day.type
## hours         .  0.800 0.760 -0.049       .  0.018        .
## tasks     0.800      . 0.720 -0.072       . -0.013        .
## pay       0.760  0.720     .  0.137       .  0.196        .
## day      -0.049 -0.072 0.137      .       .  0.990        .
## weekday       .      .     .      .       .      .        .
## week      0.018 -0.013 0.196  0.990       .      .        .
## day.type      .      .     .      .       .      .        .
lsr::correlate(work, corr.method = "spearman")
## 
## CORRELATIONS
## ============
## - correlation type:  spearman 
## - correlations shown only when both variables are numeric
## 
##           hours  tasks   pay    day weekday   week day.type
## hours         .  0.805 0.745 -0.047       .  0.010        .
## tasks     0.805      . 0.730 -0.068       . -0.008        .
## pay       0.745  0.730     .  0.094       .  0.154        .
## day      -0.047 -0.068 0.094      .       .  0.990        .
## weekday       .      .     .      .       .      .        .
## week      0.010 -0.008 0.154  0.990       .      .        .
## day.type      .      .     .      .       .      .        .
# look at the data
head(body_well)
##      sex  bodysat wellbeing
## 1 female 4.000000       1.0
## 2 female 2.428571       2.4
## 3 female 3.000000       2.3
## 4 female 3.000000       2.4
## 5 female 3.142857       2.8
## 6 female 2.428571       3.2
dplyr::glimpse(body_well)
## Rows: 106
## Columns: 3
## $ sex       <fct> female, female, female, female, female, female, female, fema…
## $ bodysat   <dbl> 4.000000, 2.428571, 3.000000, 3.000000, 3.142857, 2.428571, …
## $ wellbeing <dbl> 1.0, 2.4, 2.3, 2.4, 2.8, 3.2, 3.4, 3.4, 3.6, 3.6, 3.8, 3.8, …
plot(x = body_well$bodysat, y = body_well$wellbeing)

# scatterplot with a fitted least-squares regression line
ggplot(body_well, aes(x = bodysat, y = wellbeing)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_cowplot()

# calculating Spearman's rank correlation coefficient
# (note: method = "spearman" is used here, not Pearson)
cor(body_well$bodysat, body_well$wellbeing, method = "spearman")
## [1] 0.4726243
lsr::correlate(x = body_well$bodysat, y = body_well$wellbeing, corr.method = "spearman")
## 
## CORRELATIONS
## ============
## - correlation type:  spearman 
## - correlations shown only when both variables are numeric
## 
##       y.var
## x.var 0.473
# eyeballing the r correlation coefficient: the dashed lines mark the two
# means, splitting the plot into quadrants
ggplot(body_well, aes(x = bodysat, y = wellbeing)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  geom_vline(aes(xintercept = mean(bodysat)), size = .3, linetype = "dashed", color = "red") +
  geom_hline(aes(yintercept = mean(wellbeing)), size = .3, linetype = "dashed", color = "red") +
  theme_cowplot()

# look at the data
head(thomason1)
##   pre post
## 1  13   14
## 2  12   13
## 3  12   16
## 4   9   12
## 5  14   15
## 6  17   18
dplyr::glimpse(thomason1)
## Rows: 12
## Columns: 2
## $ pre  <dbl> 13, 12, 12, 9, 14, 17, 14, 9, 6, 7, 11, 15
## $ post <dbl> 14, 13, 16, 12, 15, 18, 13, 10, 10, 8, 14, 16
plot(x = thomason1$pre, y = thomason1$post)

# scatterplot with a fitted least-squares regression line
ggplot(thomason1, aes(x = pre, y = post)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 20), breaks = seq(0, 20, 2)) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 18), breaks = seq(0, 18, 2)) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_cowplot()

# calculating Spearman's rank correlation coefficient
# (note: method = "spearman" is used here, not Pearson)
cor(thomason1$pre, thomason1$post, method = "spearman")
## [1] 0.8424792
lsr::correlate(x = thomason1$pre, y = thomason1$post, corr.method = "spearman")
## 
## CORRELATIONS
## ============
## - correlation type:  spearman 
## - correlations shown only when both variables are numeric
## 
##       y.var
## x.var 0.842
# eyeballing the r correlation coefficient: the dashed lines mark the two
# means, splitting the plot into quadrants
ggplot(thomason1, aes(x = pre, y = post)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 20), breaks = seq(0, 20, 2)) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 18), breaks = seq(0, 18, 2)) +
  geom_vline(aes(xintercept = mean(pre)), size = .3, linetype = "dashed", color = "red") +
  geom_hline(aes(yintercept = mean(post)), size = .3, linetype = "dashed", color = "red") +
  theme_cowplot()

# look at the data
head(sleep_beauty)
##   nightly_sleep_hours rated_attractiveness
## 1                 9.3             3.475840
## 2                11.5             3.813389
## 3                 6.7             4.917097
## 4                 9.3             5.000430
## 5                 9.8             4.685868
## 6                 7.9             4.249000
dplyr::glimpse(sleep_beauty)
## Rows: 70
## Columns: 2
## $ nightly_sleep_hours  <dbl> 9.3, 11.5, 6.7, 9.3, 9.8, 7.9, 8.0, 10.4, 3.7, 10…
## $ rated_attractiveness <dbl> 3.475840, 3.813389, 4.917097, 5.000430, 4.685868,…
# eyeballing r correlation coefficient
ggplot(sleep_beauty, aes(x = nightly_sleep_hours, y = rated_attractiveness)) +
  geom_point(shape = 21, fill = "deepskyblue4", stroke = .5, size = 3, alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 8), breaks = seq(0, 8, 1)) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 14), breaks = seq(0, 14, 1)) +
  geom_vline(aes(xintercept = mean(nightly_sleep_hours)), size = .3, linetype = "dashed", color = "red") +
  geom_hline(aes(yintercept = mean(rated_attractiveness)), size = .3, linetype = "dashed", color = "red") +
  theme_cowplot()

# calculating CI's for r
cor.test(x = sleep_beauty$nightly_sleep_hours, y = sleep_beauty$rated_attractiveness)
## 
##  Pearson's product-moment correlation
## 
## data:  sleep_beauty$nightly_sleep_hours and sleep_beauty$rated_attractiveness
## t = -2.7175, df = 68, p-value = 0.008337
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.51041935 -0.08420143
## sample estimates:
##       cor 
## -0.312983

4.5 Handling missing values

# load data
load("input/parenthood2.Rdata")

# ignore all cases that have any missing values (listwise deletion:
# every correlation is computed from the same subset of complete rows)
cor(parenthood2, use = "complete.obs")
##              dan.sleep baby.sleep   dan.grump         day
## dan.sleep   1.00000000  0.6394985 -0.89951468  0.06132891
## baby.sleep  0.63949845  1.0000000 -0.58656066  0.14555814
## dan.grump  -0.89951468 -0.5865607  1.00000000 -0.06816586
## day         0.06132891  0.1455581 -0.06816586  1.00000000
# look at the variables when determining what to drop (pairwise deletion:
# each correlation uses all rows complete for that particular pair)
cor(parenthood2, use = "pairwise.complete.obs")
##              dan.sleep  baby.sleep    dan.grump          day
## dan.sleep   1.00000000  0.61472303 -0.903442442 -0.076796665
## baby.sleep  0.61472303  1.00000000 -0.567802669  0.058309485
## dan.grump  -0.90344244 -0.56780267  1.000000000  0.005833399
## day        -0.07679667  0.05830949  0.005833399  1.000000000

4.6 Transforming or recoding a variable

# load data
load("input/likert.Rdata")

# make a data frame
df <- data.frame(likert.raw)
# recode the variable (+3 strong agree, 0 no opinion, -3 strong disagree)
df$likert.centred <- df$likert.raw - 4
# create a strength variable
df$opinion.strength <- abs(df$likert.centred)
# create a direction variable (all negative numbers converted to -1, all positives to 1 and zero stays 0)
df$opinion.dir <- sign(df$likert.centred)
df
##    likert.raw likert.centred opinion.strength opinion.dir
## 1           1             -3                3          -1
## 2           7              3                3           1
## 3           3             -1                1          -1
## 4           4              0                0           0
## 5           4              0                0           0
## 6           4              0                0           0
## 7           2             -2                2          -1
## 8           6              2                2           1
## 9           5              1                1           1
## 10          5              1                1           1
# cutting numeric variables into categories
age <- c(60, 58, 24, 26, 34, 42, 31, 30, 33, 2, 9)
age.breaks <- seq(from = 0, to = 60, by = 20)
age.labels <- c("young", "adult", "older")
# note: cut() uses right-closed intervals (0,20], (20,40], (40,60] by default,
# so age 60 falls in "older" and age 0 (if present) would become NA
age.group <- cut(x = age, breaks = age.breaks, labels = age.labels)
age.df <- data.frame(age, age.group)

4.7 Subsetting

# load data
load("input/nightgarden.Rdata")
itng <- data.frame(speaker, utterance)


# using split: one list element of utterances per speaker level
speech.by.char <- split(utterance, speaker)
speech.by.char
## $`makka-pakka`
## [1] "pip" "pip" "onk" "onk"
## 
## $tombliboo
## [1] "ee" "oo"
## 
## $`upsy-daisy`
## [1] "pip" "pip" "onk" "onk"
# pull grouping variables out of the list and into workspace
# (ask = FALSE skips the interactive confirmation prompt)
lsr::importList(speech.by.char, ask = FALSE)

# using subset: rows for one speaker, keeping only the utterance column
df <- subset(itng, subset = speaker == "makka-pakka", select = utterance)

4.8 Sorting, flipping and merging data

# load data
load("input/nightgarden2.Rdata")

# sorting a data frame by speaker, then by line number
lsr::sortFrame(garden, speaker, line)
##            speaker utterance line
## case.4 makka-pakka       pip    7
## case.5 makka-pakka       onk    9
## case.3   tombliboo        ee    5
## case.1  upsy-daisy       pip    1
## case.2  upsy-daisy       pip    2
# reverse order (a minus sign in front of a variable flips its sort direction)
lsr::sortFrame(garden, -speaker, line)
##            speaker utterance line
## case.1  upsy-daisy       pip    1
## case.2  upsy-daisy       pip    2
## case.3   tombliboo        ee    5
## case.4 makka-pakka       pip    7
## case.5 makka-pakka       onk    9

4.9 Working with text

# Shortening a string
animals <- c("cat", "dog", "kangaroo", "whale")

# keep only the first `width` characters of each element
strtrim(animals, width = 3)
## [1] "cat" "dog" "kan" "wha"
# extract characters 2 through 3 of each element
substr(animals, start = 2, stop = 3)
## [1] "at" "og" "an" "ha"
# Paste strings together
paste("hello", "world")
## [1] "hello world"
paste0("hello", "world")
## [1] "helloworld"
paste("hello", "world", sep = ".")
## [1] "hello.world"
hw <- c("hello", "world")
ng <- c("nasty", "government")
# element-wise pasting of two vectors
paste(hw, ng)
## [1] "hello nasty"      "world government"
# collapse the pasted pairs into a single string
paste(hw, ng, collapse = " ")
## [1] "hello nasty world government"
# Splitting strings
monkey <- "It was the best of times. It was the blurst of times."
# fixed = TRUE treats the split pattern literally rather than as a regex
# (spelled out as TRUE, not T, since T/F are reassignable aliases)
monkey.1 <- strsplit(monkey, split = " ", fixed = TRUE)
monkey.2 <- strsplit(monkey, split = ".", fixed = TRUE)

# Making transformations
old.text <- "albmino"
# character-by-character translation: a -> c, l -> h, b -> u
chartr(old = "alb", new = "chu", old.text)
## [1] "chumino"
# Matching and substituting text
beers <- c("little creatures", "sierra nevada", "coopers pale")
# value = TRUE returns the matching strings instead of their indices
grep(pattern = "er", beers, fixed = TRUE, value = TRUE)
## [1] "sierra nevada" "coopers pale"
# gsub replaces every match in each element ...
gsub(pattern = "a", replacement = "BLAH", beers, fixed = TRUE)
## [1] "little creBLAHtures"    "sierrBLAH nevBLAHdBLAH" "coopers pBLAHle"
# ... while sub replaces only the first match per element
sub(pattern = "a", replacement = "BLAH", beers, fixed = TRUE)
## [1] "little creBLAHtures" "sierrBLAH nevada"    "coopers pBLAHle"

5 Inferential statistics

5.1 Introduction to probability

# Functions to draw plots to show probability distributions

# Parameters for plots
emphCol <- rgb(0, 0, 1) # main emphasis colour (pure blue)
colour <- TRUE # toggle used below: colour plots (TRUE) vs greyscale (FALSE)
emphColLight <- rgb(.5, .5, 1) # lighter blue used for histogram fills
emphGrey <- grey(.5) # mid-grey fallback when colour = FALSE

# Draw the binomial probability mass function as vertical bars.
# n   - number of trials
# p   - success probability per trial
# ... - additional graphical parameters forwarded to plot()
binomPlot <- function(n, p, ...) {
  # P(X = k) for each possible outcome k = 0, ..., n
  outcomes <- 0:n
  pmf <- dbinom(x = outcomes, size = n, prob = p)

  # vertical-bar ("h") plot, coloured according to the global `colour` flag
  bar_colour <- ifelse(colour, emphCol, "black")
  plot(
    outcomes, pmf,
    type = "h", lwd = 3, ylab = "Probability",
    frame.plot = FALSE, col = bar_colour, ...
  )
}
binomPlot(40, 1 / 4, xlab = "Number of skulls observed")

# Draw the density curve of the standard normal distribution on [-3, 3].
standardNormal <- function() {
  # evaluate dnorm on a fine grid and plot it as a line
  grid_x <- seq(-3, 3, .01)
  density_y <- dnorm(grid_x, 0, 1)
  line_colour <- ifelse(colour, emphCol, "black")
  plot(grid_x, density_y,
    lwd = 3, ylab = "Probability Density", xlab = "Observed Value",
    frame.plot = FALSE, col = line_colour, type = "l"
  )
}
standardNormal()

# Function to plot the different probability distributions
# NOTE(review): there is no set.seed() call, so the simulated histograms
# differ on every run; the overlaid theoretical curves are deterministic.
variateRelations <- function() {

  # generate the data
  n <- 1000
  normal.a <- rnorm(n)
  normal.b <- rnorm(n)
  normal.c <- rnorm(n)
  # chi-square with 3 df = sum of squares of three standard normals
  chi.sq.3 <- (normal.a)^2 + (normal.b)^2 + (normal.c)^2
  normal.d <- rnorm(n)
  # t with 3 df = standard normal / sqrt(chi-square_3 / 3)
  t.3 <- normal.d / sqrt(chi.sq.3 / 3)
  chi.sq.20 <- rchisq(n, 20)
  # F(3, 20) = ratio of two independent scaled chi-squares
  F.3.20 <- (chi.sq.3 / 3) / (chi.sq.20 / 20)

  # histogram for the normal data
  bw <- .25
  hist(normal.a, seq(min(normal.a) - bw, max(normal.a) + bw, bw),
    freq = FALSE, xlim = c(-4, 4),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", ylim = c(0, .45), axes = FALSE,
    xlab = "", ylab = "", main = "Simulated Normal Data",
    font.main = 1
  )
  # overlay the theoretical density; assign the grid before the call rather
  # than inside it (the original `lines(x <- seq(...))` worked, but hiding an
  # assignment inside a function call is needlessly obscure)
  x <- seq(-4, 4, .1)
  lines(x, dnorm(x), lwd = 3, col = "black")
  axis(1)

  # histogram for the chi square data
  bw <- .5
  hist(chi.sq.3, seq(0, max(chi.sq.3) + bw, bw),
    freq = FALSE, xlim = c(0, 16),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", axes = FALSE, ylim = c(0, .25),
    xlab = "", ylab = "", main = "Simulated Chi-Square Data",
    font.main = 1
  )
  x <- seq(0, 16, .1)
  lines(x, dchisq(x, 3), lwd = 3, col = "black")
  axis(1)

  # histogram for the t data
  bw <- .3
  hist(t.3, seq(min(t.3) - bw, max(t.3) + bw, bw),
    freq = FALSE, xlim = c(-5, 5),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", axes = FALSE, ylim = c(0, .4),
    xlab = "", ylab = "", main = "Simulated t Data",
    font.main = 1
  )
  x <- seq(-4, 4, .1)
  lines(x, dt(x, 3), lwd = 3, col = "black")
  axis(1)

  # histogram for the F dist data
  bw <- .2
  hist(F.3.20, seq(0, max(F.3.20) + bw, bw),
    freq = FALSE, xlim = c(0, 6),
    col = ifelse(colour, emphColLight, emphGrey),
    border = "white", axes = FALSE, ylim = c(0, .7),
    xlab = "", ylab = "", main = "Simulated F Data",
    font.main = 1
  )
  x <- seq(0, 6, .01)
  lines(x, df(x, 3, 20), lwd = 3, col = "black")
  axis(1)
}
variateRelations()

# Plotting the sampling distribution of the mean with different sample sizes
samplingDistributions <- function() {

  # Plots histograms of IQ samples and sampling distributions

  # n draws from the sampling distribution of the mean of N IQ scores:
  # the mean of N observations has standard deviation 15 / sqrt(N)
  plotSamples <- function(n, N) {
    IQ <- rnorm(n, 100, 15 / sqrt(N))
    hist(IQ,
      breaks = seq(10, 180, 5), border = "white", freq = FALSE,
      col = ifelse(colour, emphColLight, emphGrey),
      xlab = "IQ Score", ylab = "", xlim = c(60, 140),
      main = paste("Sample Size =", N), axes = FALSE,
      font.main = 1, ylim = c(0, .07)
    )
    axis(1)
  }

  # population distribution (IQ ~ Normal(100, 15)), overlaid on each plot
  x <- 60:140
  y <- dnorm(x, 100, 15)

  # plot two different sample sizes
  plotSamples(10000, 1)
  lines(x, y, lwd = 2, col = "black", type = "l")

  # plot two different sample sizes
  plotSamples(1000, 2)
  lines(x, y, lwd = 2, col = "black", type = "l")

  # plot two different sample sizes
  plotSamples(1000, 10)
  lines(x, y, lwd = 2, col = "black", type = "l")
}
samplingDistributions()

5.2 Confidence interval (CI)

# computing the 2.5th and the 97.5th percentiles (95% CI) of the normal distribution
qnorm(p = c(.025, .975))
## [1] -1.959964  1.959964
# the 15th and 85th (70% CI)
qnorm(p = c(.15, .85))
## [1] -1.036433  1.036433
# calculating quantiles of the t distribution (95% CI)
qt(p = .975, df = 1000 - 1) # large sample
## [1] 1.962341
qt(p = .975, df = 10 - 1) # small sample
## [1] 2.262157
# calculating CI
# load data
load("input/afl24.Rdata")
lsr::ciMean(x = afl$attendance) # 95% CI (default)
##          2.5%    97.5%
## [1,] 31597.32 32593.12
lsr::ciMean(x = afl$attendance, conf = .99) # 99% CI (conf set explicitly)
##          0.5%    99.5%
## [1,] 31440.77 32749.68
# plotting CI
# using bargraph
sciplot::bargraph.CI(
  x.factor = year, # grouping variable
  response = attendance, # outcome variable
  data = afl, # data
  ci.fun = ciMean, # name of the function to calculate CIs
  xlab = "Year",
  ylab = "Average Attendance"
)

# using lineplot
sciplot::lineplot.CI(
  x.factor = year, # grouping variable
  response = attendance, # outcome variable
  data = afl, # data
  ci.fun = ciMean, # name of the function to calculate CIs
  xlab = "Year",
  ylab = "Average Attendance"
)

# using plotmeans
gplots::plotmeans(
  formula = attendance ~ year,
  data = afl,
  n.label = FALSE
)

5.3 Categorical Data Analysis

# load data
# (each .Rdata file supplies the data set used in the matching section below)
load("input/randomness.Rdata")
load("input/chapek9.Rdata")
load("input/salem.Rdata")
load("input/agpp.Rdata")

#---------- The chi-squared goodness-of-fit test ----------

# observed frequencies of the first card choice
observed <- table(cards$choice_1)
observed
## 
##    clubs diamonds   hearts   spades 
##       35       51       64       50
# R mathematical version
# null hypothesis: all four suits equally likely
# (fixed typo: the fourth suit is "spades", not "spaces")
probabilities <- c(clubs = .25, diamonds = .25, hearts = .25, spades = .25)
N <- 200
expected <- N * probabilities # expected frequencies
expected
##    clubs diamonds   hearts   spades 
##       50       50       50       50
# difference between null hypothesis expected and actual values
observed - expected
## 
##    clubs diamonds   hearts   spades 
##      -15        1       14        0
# squared that difference to get a collection of numbers that are big whenever the
# null hypothesis makes a bad prediction
(observed - expected)^2
## 
##    clubs diamonds   hearts   spades 
##      225        1      196        0
# getting different error scores
(observed - expected)^2 / expected
## 
##    clubs diamonds   hearts   spades 
##     4.50     0.02     3.92     0.00
# getting the goodness of fit statistic (X^2 or GOF)
sum((observed - expected)^2 / expected)
## [1] 8.44
# calculating the 95th percentile of a chi-squared distribution with k - 1 degrees of freedom
qchisq(p = .95, df = 3)
## [1] 7.814728
# getting the p-value (upper-tail probability beyond the observed statistic)
pchisq(q = 8.44, df = 3, lower.tail = FALSE)
## [1] 0.03774185
# or
1 - pchisq(q = 8.44, df = 3)
## [1] 0.03774185
# all of the above calculations in a single call
lsr::goodnessOfFitTest(cards$choice_1)
## 
##      Chi-square test against specified probabilities
## 
## Data variable:   cards$choice_1 
## 
## Hypotheses: 
##    null:        true probabilities are as specified
##    alternative: true probabilities differ from those specified
## 
## Descriptives: 
##          observed freq. expected freq. specified prob.
## clubs                35             50            0.25
## diamonds             51             50            0.25
## hearts               64             50            0.25
## spades               50             50            0.25
## 
## Test results: 
##    X-squared statistic:  8.44 
##    degrees of freedom:  3 
##    p-value:  0.038
# specifying a different null hypothesis (unequal suit probabilities)
nullProbs <- c(clubs = .2, diamonds = .3, hearts = .3, spades = .2)
lsr::goodnessOfFitTest(cards$choice_1, p = nullProbs)
## 
##      Chi-square test against specified probabilities
## 
## Data variable:   cards$choice_1 
## 
## Hypotheses: 
##    null:        true probabilities are as specified
##    alternative: true probabilities differ from those specified
## 
## Descriptives: 
##          observed freq. expected freq. specified prob.
## clubs                35             40             0.2
## diamonds             51             60             0.3
## hearts               64             60             0.3
## spades               50             40             0.2
## 
## Test results: 
##    X-squared statistic:  4.742 
##    degrees of freedom:  3 
##    p-value:  0.192
# using chisq.test() for the goodness of fit test -> frequency table (observed)
chisq.test(observed)
## 
##  Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 8.44, df = 3, p-value = 0.03774
chisq.test(x = observed, p = c(clubs = .2, diamonds = .3, hearts = .3, spades = .2))
## 
##  Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 4.7417, df = 3, p-value = 0.1917
#---------- The chi-squared test of independence (or association) ----------

# producing a contingency table (choice x species cross-tabulation)
chapekFrequencies <- xtabs(~ choice + species, data = chapek9)
chapekFrequencies
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
# add row/column totals; quiet = TRUE suppresses the extended-margin message
chapekFrequenciesSum <- addmargins(chapekFrequencies, margin = c(1, 2), FUN = sum, quiet = TRUE)

# running the test
lsr::associationTest(formula = ~ choice + species, data = chapek9)
## 
##      Chi-square test of categorical association
## 
## Variables:   choice, species 
## 
## Hypotheses: 
##    null:        variables are independent of one another
##    alternative: some contingency exists between variables
## 
## Observed contingency table:
##         species
## choice   robot human
##   puppy     13    15
##   flower    30    13
##   data      44    65
## 
## Expected contingency table under the null hypothesis:
##         species
## choice   robot human
##   puppy   13.5  14.5
##   flower  20.8  22.2
##   data    52.7  56.3
## 
## Test results: 
##    X-squared statistic:  10.722 
##    degrees of freedom:  2 
##    p-value:  0.005 
## 
## Other information: 
##    estimated effect size (Cramer's v):  0.244
# using chisq.test() for a test of independence (association) -> cross-tabulation table (chapekFrequencies)
chisq.test(chapekFrequencies)
## 
##  Pearson's Chi-squared test
## 
## data:  chapekFrequencies
## X-squared = 10.722, df = 2, p-value = 0.004697
#---------- The Fisher exact test ----------

# `trial` presumably comes from salem.Rdata loaded above -- TODO confirm
salem.tabs <- table(trial)
salem.tabs
##        on.fire
## happy   FALSE TRUE
##   FALSE     3    3
##   TRUE     10    0
# running the test (exact test, suitable for small cell counts)
fisher.test(salem.tabs)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  salem.tabs
## p-value = 0.03571
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.000000 1.202913
## sample estimates:
## odds ratio 
##          0
#---------- The McNemar test ----------

# paired before/after responses cross-tabulated
right.table <- xtabs(~ response_before + response_after, data = agpp)
right.table
##                response_after
## response_before no yes
##             no  65   5
##             yes 25   5
# running the test
mcnemar.test(right.table)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  right.table
## McNemar's chi-squared = 12.033, df = 1, p-value = 0.0005226
cardChoices <- xtabs(~ choice_1 + choice_2, data = cards)
cardChoices
##           choice_2
## choice_1   clubs diamonds hearts spades
##   clubs       10        9     10      6
##   diamonds    20        4     13     14
##   hearts      20       18      3     23
##   spades      18       13     15      4
# testing if there is a relation between rows and columns
chisq.test(cardChoices)
## 
##  Pearson's Chi-squared test
## 
## data:  cardChoices
## X-squared = 29.237, df = 9, p-value = 0.0005909
# testing if the row totals (the frequencies for choice_1) are different from the column totals (the frequencies for choice_2)
mcnemar.test(cardChoices)
## 
##  McNemar's Chi-squared test
## 
## data:  cardChoices
## McNemar's chi-squared = 16.033, df = 6, p-value = 0.01358

5.4 Comparing two means

5.4.1 The one-sample z-test

# Load data
load("input/zeppo.Rdata")
grades
##  [1] 50 60 60 64 66 66 67 69 70 74 76 76 77 79 79 79 81 82 82 89
# Calculating z-test
# 1. sample mean
sample.mean <- mean(grades)

# 2. value of the population mean null hypothesis specifies
mu.null <- 67.5

# 3. assumed population standard deviation
sd.true <- 9.5

# 4. sample size
N <- length(grades)

# 5. calculate the true standard error of the mean
sem.true <- sd.true / sqrt(N)

# 6. calculate z-score
z.score <- (sample.mean - mu.null) / sem.true
z.score
## [1] 2.259606
# 7. calculate critical regions (area under the curve)
# (TRUE/FALSE spelled out rather than the reassignable aliases T/F)
# upper tail
upper.area <- pnorm(q = z.score, lower.tail = FALSE)
upper.area
## [1] 0.01192287
# lower tail
lower.area <- pnorm(q = -z.score, lower.tail = TRUE)
lower.area
## [1] 0.01192287
# 8. calculate p-value (two-sided: sum of both tail areas)
p.value <- lower.area + upper.area
p.value
## [1] 0.02384574

5.4.2 The one-sample t-test

# running one-sample t-test (two sided test)
lsr::oneSampleTTest(x = grades, mu = 67.5)
## 
##    One sample t-test 
## 
## Data variable:   grades 
## 
## Descriptive statistics: 
##             grades
##    mean     72.300
##    std dev.  9.521
## 
## Hypotheses: 
##    null:        population mean equals 67.5 
##    alternative: population mean not equal to 67.5 
## 
## Test results: 
##    t-statistic:  2.255 
##    degrees of freedom:  19 
##    p-value:  0.036 
## 
## Other information: 
##    two-sided 95% confidence interval:  [67.844, 76.756] 
##    estimated effect size (Cohen's d):  0.504
# one sided test (alternative: population mean greater than 67.5)
lsr::oneSampleTTest(x = grades, mu = 67.5, one.sided = "greater")
## 
##    One sample t-test 
## 
## Data variable:   grades 
## 
## Descriptive statistics: 
##             grades
##    mean     72.300
##    std dev.  9.521
## 
## Hypotheses: 
##    null:        population mean less than or equal to 67.5 
##    alternative: population mean greater than 67.5 
## 
## Test results: 
##    t-statistic:  2.255 
##    degrees of freedom:  19 
##    p-value:  0.018 
## 
## Other information: 
##    one-sided 95% confidence interval:  [68.619, Inf] 
##    estimated effect size (Cohen's d):  0.504
# or (alternative: population mean less than 67.5)
lsr::oneSampleTTest(x = grades, mu = 67.5, one.sided = "less")
## 
##    One sample t-test 
## 
## Data variable:   grades 
## 
## Descriptive statistics: 
##             grades
##    mean     72.300
##    std dev.  9.521
## 
## Hypotheses: 
##    null:        population mean greater than or equal to 67.5 
##    alternative: population mean less than 67.5 
## 
## Test results: 
##    t-statistic:  2.255 
##    degrees of freedom:  19 
##    p-value:  0.982 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, 75.981] 
##    estimated effect size (Cohen's d):  0.504
# using the base R t-test function (same two-sided test)
t.test(x = grades, mu = 67.5)
## 
##  One Sample t-test
## 
## data:  grades
## t = 2.2547, df = 19, p-value = 0.03615
## alternative hypothesis: true mean is not equal to 67.5
## 95 percent confidence interval:
##  67.84422 76.75578
## sample estimates:
## mean of x 
##      72.3

5.4.3 The independent samples t-test

# Load the data (grades for students taught by two different tutors)
load("input/harpo.Rdata")
harpo
##    grade      tutor
## 1     65  Anastasia
## 2     72 Bernadette
## 3     66 Bernadette
## 4     74  Anastasia
## 5     73  Anastasia
## 6     71 Bernadette
## 7     66 Bernadette
## 8     76 Bernadette
## 9     69 Bernadette
## 10    79 Bernadette
## 11    73 Bernadette
## 12    62 Bernadette
## 13    83  Anastasia
## 14    76  Anastasia
## 15    69 Bernadette
## 16    68 Bernadette
## 17    65  Anastasia
## 18    86  Anastasia
## 19    60 Bernadette
## 20    70  Anastasia
## 21    80  Anastasia
## 22    73 Bernadette
## 23    68 Bernadette
## 24    55  Anastasia
## 25    67 Bernadette
## 26    78  Anastasia
## 27    78  Anastasia
## 28    90  Anastasia
## 29    77  Anastasia
## 30    74 Bernadette
## 31    56 Bernadette
## 32    68  Anastasia
## 33    74 Bernadette
# Parameters for plots (same colour scheme as in section 5.1)
emphCol <- rgb(0, 0, 1)
emphColLight <- rgb(.5, .5, 1)
emphGrey <- grey(.5)
colour <- TRUE

# Plot side-by-side grade histograms for the two tutors' students.
harpoHist <- function() {
  # shared histogram styling; extra arguments are forwarded to hist()
  drawHist <- function(values, ...) {
    hist(values,
      border = "white",
      col = ifelse(colour, emphColLight, emphGrey), ...
    )
    axis(1)
  }

  # Anastasia's class
  drawHist(harpo$grade[harpo$tutor == "Anastasia"],
    xlim = c(50, 100), xlab = "Grade", main = "Anastasia's students",
    font.main = 1, breaks = seq(50, 100, 5), ylim = c(0, 7)
  )

  # Bernadette's class
  drawHist(harpo$grade[harpo$tutor == "Bernadette"],
    xlim = c(50, 100), xlab = "Grade", main = "Bernadette's students",
    font.main = 1, breaks = seq(50, 100, 5), ylim = c(0, 7)
  )
}

harpoHist()

# Plot the mean grade per tutor with confidence intervals,
# plus a slope-1 reference line through the origin.
harpoMeans <- function() {
  # group means with CI error bars (plotmeans comes from gplots)
  plotmeans(
    formula = grade ~ tutor, # plot grade by tutor group
    data = harpo, # data frame
    n.label = FALSE, # suppress the per-group sample-size labels
    xlab = "Class", # x-axis label
    ylab = "Grade" # y-axis label
  )

  # reference line: intercept 0, slope 1
  abline(a = 0, b = 1)
}

harpoMeans()

# running the Student's independent samples t-test (two sided test)
# var.equal = TRUE requests the classic pooled-variance Student test;
# spelled out as TRUE rather than the reassignable alias T
lsr::independentSamplesTTest(formula = grade ~ tutor, data = harpo, var.equal = TRUE)
## 
##    Student's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means equal for both groups
##    alternative: different population means in each group
## 
## Test results: 
##    t-statistic:  2.115 
##    degrees of freedom:  31 
##    p-value:  0.043 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.197, 10.759] 
##    estimated effect size (Cohen's d):  0.74
# running the Welch's independent samples t-test (two sided test)
lsr::independentSamplesTTest(formula = grade ~ tutor, data = harpo) # not equal variances assumed
## 
##    Welch's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means equal for both groups
##    alternative: different population means in each group
## 
## Test results: 
##    t-statistic:  2.034 
##    degrees of freedom:  23.025 
##    p-value:  0.054 
## 
## Other information: 
##    two-sided 95% confidence interval:  [-0.092, 11.048] 
##    estimated effect size (Cohen's d):  0.724
# using base R t.test function (Welch's test is its default)
t.test(formula = grade ~ tutor, data = harpo)
## 
##  Welch Two Sample t-test
## 
## data:  grade by tutor
## t = 2.0342, df = 23.025, p-value = 0.05361
## alternative hypothesis: true difference in means between group Anastasia and group Bernadette is not equal to 0
## 95 percent confidence interval:
##  -0.09249349 11.04804904
## sample estimates:
##  mean in group Anastasia mean in group Bernadette 
##                 74.53333                 69.05556
# Student's test via base R: var.equal = TRUE pools the two group variances
# (spelled out as TRUE rather than the reassignable alias T)
t.test(formula = grade ~ tutor, data = harpo, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  grade by tutor
## t = 2.1154, df = 31, p-value = 0.04253
## alternative hypothesis: true difference in means between group Anastasia and group Bernadette is not equal to 0
## 95 percent confidence interval:
##   0.1965873 10.7589683
## sample estimates:
##  mean in group Anastasia mean in group Bernadette 
##                 74.53333                 69.05556
# one sided test (alternative: Anastasia's group has the larger mean)
lsr::independentSamplesTTest(formula = grade ~ tutor, data = harpo, one.sided = "Anastasia")
## 
##    Welch's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for group 'Anastasia' 
##    alternative: population mean is larger for group 'Anastasia' 
## 
## Test results: 
##    t-statistic:  2.034 
##    degrees of freedom:  23.025 
##    p-value:  0.027 
## 
## Other information: 
##    one-sided 95% confidence interval:  [0.863, Inf] 
##    estimated effect size (Cohen's d):  0.724
# or (alternative: Bernadette's group has the larger mean)
lsr::independentSamplesTTest(formula = grade ~ tutor, data = harpo, one.sided = "Bernadette")
## 
##    Welch's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for group 'Bernadette' 
##    alternative: population mean is larger for group 'Bernadette' 
## 
## Test results: 
##    t-statistic:  2.034 
##    degrees of freedom:  23.025 
##    p-value:  0.973 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, 10.093] 
##    estimated effect size (Cohen's d):  0.724

5.4.4 The paired-samples t-test

# load data (same students measured on two tests: a paired design)
load("input/chico.Rdata")
chico
##           id grade_test1 grade_test2
## 1   student1        42.9        44.6
## 2   student2        51.8        54.0
## 3   student3        71.7        72.3
## 4   student4        51.6        53.4
## 5   student5        63.5        63.8
## 6   student6        58.0        59.3
## 7   student7        59.8        60.8
## 8   student8        50.8        51.6
## 9   student9        62.5        64.3
## 10 student10        61.9        63.2
## 11 student11        50.4        51.8
## 12 student12        52.6        52.2
## 13 student13        63.0        63.0
## 14 student14        58.3        60.5
## 15 student15        53.3        57.1
## 16 student16        58.7        60.1
## 17 student17        50.1        51.7
## 18 student18        64.2        65.6
## 19 student19        57.4        58.3
## 20 student20        57.1        60.1
psych::describe(chico)
##             vars  n  mean   sd median trimmed  mad  min  max range  skew
## id*            1 20 10.50 5.92   10.5   10.50 7.41  1.0 20.0  19.0  0.00
## grade_test1    2 20 56.98 6.62   57.7   56.92 7.71 42.9 71.7  28.8  0.05
## grade_test2    3 20 58.38 6.41   59.7   58.35 6.45 44.6 72.3  27.7 -0.05
##             kurtosis   se
## id*            -1.38 1.32
## grade_test1    -0.35 1.48
## grade_test2    -0.39 1.43
# create a "difference" variable (per-student improvement from test 1 to test 2)
chico$improvement <- chico$grade_test2 - chico$grade_test1
chico
##           id grade_test1 grade_test2 improvement
## 1   student1        42.9        44.6         1.7
## 2   student2        51.8        54.0         2.2
## 3   student3        71.7        72.3         0.6
## 4   student4        51.6        53.4         1.8
## 5   student5        63.5        63.8         0.3
## 6   student6        58.0        59.3         1.3
## 7   student7        59.8        60.8         1.0
## 8   student8        50.8        51.6         0.8
## 9   student9        62.5        64.3         1.8
## 10 student10        61.9        63.2         1.3
## 11 student11        50.4        51.8         1.4
## 12 student12        52.6        52.2        -0.4
## 13 student13        63.0        63.0         0.0
## 14 student14        58.3        60.5         2.2
## 15 student15        53.3        57.1         3.8
## 16 student16        58.7        60.1         1.4
## 17 student17        50.1        51.7         1.6
## 18 student18        64.2        65.6         1.4
## 19 student19        57.4        58.3         0.9
## 20 student20        57.1        60.1         3.0
# running a paired samples t-test
#
# Option 1:
# a. create a "difference" variable (chico$improvement)
# b. run a one sample t-test on it, against a null mean of 0
lsr::oneSampleTTest(chico$improvement, mu = 0)
## 
##    One sample t-test 
## 
## Data variable:   chico$improvement 
## 
## Descriptive statistics: 
##             improvement
##    mean           1.405
##    std dev.       0.970
## 
## Hypotheses: 
##    null:        population mean equals 0 
##    alternative: population mean not equal to 0 
## 
## Test results: 
##    t-statistic:  6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.951, 1.859] 
##    estimated effect size (Cohen's d):  1.448
# Option 2:
# using the pairedSamplesTTest function from lsr package (one sided formula) (two sided test)
lsr::pairedSamplesTTest(formula = ~ grade_test2 + grade_test1, data = chico)
## 
##    Paired samples t-test 
## 
## Variables:  grade_test2 , grade_test1 
## 
## Descriptive statistics: 
##             grade_test2 grade_test1 difference
##    mean          58.385      56.980      1.405
##    std dev.       6.406       6.616      0.970
## 
## Hypotheses: 
##    null:        population means equal for both measurements
##    alternative: different population means for each measurement
## 
## Test results: 
##    t-statistic:  6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.951, 1.859] 
##    estimated effect size (Cohen's d):  1.448
#
# one sided test (alternative: grade_test2 has the larger population mean)
lsr::pairedSamplesTTest(formula = ~ grade_test2 + grade_test1, data = chico, one.sided = "grade_test2")
## 
##    Paired samples t-test 
## 
## Variables:  grade_test2 , grade_test1 
## 
## Descriptive statistics: 
##             grade_test2 grade_test1 difference
##    mean          58.385      56.980      1.405
##    std dev.       6.406       6.616      0.970
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for measurement 'grade_test2' 
##    alternative: population mean is larger for measurement 'grade_test2' 
## 
## Test results: 
##    t-statistic:  6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    one-sided 95% confidence interval:  [1.03, Inf] 
##    estimated effect size (Cohen's d):  1.448
# running a paired samples t-test from long form data
# first, reshape chico from wide (one row per student) to long form
# (one row per student x test occasion); namespace-qualified for
# consistency with the other lsr:: calls in this report
chico2 <- lsr::wideToLong(chico, within = "time")
chico2
##           id improvement  time grade
## 1   student1         1.7 test1  42.9
## 2   student2         2.2 test1  51.8
## 3   student3         0.6 test1  71.7
## 4   student4         1.8 test1  51.6
## 5   student5         0.3 test1  63.5
## 6   student6         1.3 test1  58.0
## 7   student7         1.0 test1  59.8
## 8   student8         0.8 test1  50.8
## 9   student9         1.8 test1  62.5
## 10 student10         1.3 test1  61.9
## 11 student11         1.4 test1  50.4
## 12 student12        -0.4 test1  52.6
## 13 student13         0.0 test1  63.0
## 14 student14         2.2 test1  58.3
## 15 student15         3.8 test1  53.3
## 16 student16         1.4 test1  58.7
## 17 student17         1.6 test1  50.1
## 18 student18         1.4 test1  64.2
## 19 student19         0.9 test1  57.4
## 20 student20         3.0 test1  57.1
## 21  student1         1.7 test2  44.6
## 22  student2         2.2 test2  54.0
## 23  student3         0.6 test2  72.3
## 24  student4         1.8 test2  53.4
## 25  student5         0.3 test2  63.8
## 26  student6         1.3 test2  59.3
## 27  student7         1.0 test2  60.8
## 28  student8         0.8 test2  51.6
## 29  student9         1.8 test2  64.3
## 30 student10         1.3 test2  63.2
## 31 student11         1.4 test2  51.8
## 32 student12        -0.4 test2  52.2
## 33 student13         0.0 test2  63.0
## 34 student14         2.2 test2  60.5
## 35 student15         3.8 test2  57.1
## 36 student16         1.4 test2  60.1
## 37 student17         1.6 test2  51.7
## 38 student18         1.4 test2  65.6
## 39 student19         0.9 test2  58.3
## 40 student20         3.0 test2  60.1
# second, using the pairedSamplesTTest (two sided formula)
# Option 1 (two sided test)
lsr::pairedSamplesTTest(formula = grade ~ time, data = chico2, id = "id")
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means equal for both measurements
##    alternative: different population means for each measurement
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [-1.859, -0.951] 
##    estimated effect size (Cohen's d):  1.448
# one sided test
lsr::pairedSamplesTTest(formula = grade ~ time, data = chico2, id = "id", one.sided = "test2")
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for measurement 'test2' 
##    alternative: population mean is larger for measurement 'test2' 
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, -1.03] 
##    estimated effect size (Cohen's d):  1.448
# Option 2
lsr::pairedSamplesTTest(formula = grade ~ time + (id), data = chico2)
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means equal for both measurements
##    alternative: different population means for each measurement
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    two-sided 95% confidence interval:  [-1.859, -0.951] 
##    estimated effect size (Cohen's d):  1.448
# using the base-R t.test function; TRUE spelled out (T is a reassignable
# binding, not a reserved word, so it can be silently redefined)
t.test(x = chico$grade_test2, y = chico$grade_test1, paired = TRUE) # the first element of x and y must correspond to the same person
## 
##  Paired t-test
## 
## data:  chico$grade_test2 and chico$grade_test1
## t = 6.4754, df = 19, p-value = 0.000003321
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.9508686 1.8591314
## sample estimates:
## mean difference 
##           1.405
# one sided test
lsr::pairedSamplesTTest(formula = grade ~ time + (id), data = chico2, one.sided = "test2")
## 
##    Paired samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  time 
## ID variable:        id 
## 
## Descriptive statistics: 
##              test1  test2 difference
##    mean     56.980 58.385     -1.405
##    std dev.  6.616  6.406      0.970
## 
## Hypotheses: 
##    null:        population means are equal, or smaller for measurement 'test2' 
##    alternative: population mean is larger for measurement 'test2' 
## 
## Test results: 
##    t-statistic:  -6.475 
##    degrees of freedom:  19 
##    p-value:  <.001 
## 
## Other information: 
##    one-sided 95% confidence interval:  [-Inf, -1.03] 
##    estimated effect size (Cohen's d):  1.448

5.4.5 Effect size

# Cohen's d from one sample: standardised distance of the sample mean
# from the hypothesised population mean mu
lsr::cohensD(x = grades, mu = 67.5)
## [1] 0.5041691
# Cohen's d from a Student t-test: "pooled" uses the pooled standard
# deviation (matches the equal-variances t-test above)
lsr::cohensD(formula = grade ~ tutor, data = harpo, method = "pooled")
## [1] 0.7395614
# Cohen's d from a Welch t-test: "unequal" does not assume equal
# group variances (matches the Welch t-test above)
lsr::cohensD(formula = grade ~ tutor, data = harpo, method = "unequal")
## [1] 0.7244995

5.4.6 Checking the normality of a sample

# generating data: 100 draws from a standard normal distribution
# NOTE(review): no set.seed() call precedes this, so the draws -- and the
# Shapiro-Wilk output echoed below -- differ on every knit
normal.data <- rnorm(n = 100)

# plotting data: histogram should look roughly bell-shaped for normal data
hist(x = normal.data)

# QQ plot: points should fall near a straight line if the data are normal
qqnorm(y = normal.data)

# running a Shapiro-Wilk test (null hypothesis: data are normal)
shapiro.test(x = normal.data)
## 
##  Shapiro-Wilk normality test
## 
## data:  normal.data
## W = 0.99108, p-value = 0.7515

5.4.7 Testing non-normal data with Wilcoxon tests

# load data
load("input/awesome.Rdata")
load("input/awesome2.Rdata")
load("input/happy.Rdata")
awesome
##    scores group
## 1     6.4     A
## 2    10.7     A
## 3    11.9     A
## 4     7.3     A
## 5    10.0     A
## 6    14.5     B
## 7    10.4     B
## 8    12.9     B
## 9    11.7     B
## 10   13.0     B
score.A
## [1]  6.4 10.7 11.9  7.3 10.0
score.B
## [1] 14.5 10.4 12.9 11.7 13.0
happiness
##    before after change
## 1      30     6    -24
## 2      43    29    -14
## 3      21    11    -10
## 4      24    31      7
## 5      23    17     -6
## 6      40     2    -38
## 7      29    31      2
## 8      56    21    -35
## 9      38     8    -30
## 10     16    21      5
# running the two sample Wilcoxon test
wilcox.test(formula = scores ~ group, data = awesome)
## 
##  Wilcoxon rank sum exact test
## 
## data:  scores by group
## W = 3, p-value = 0.05556
## alternative hypothesis: true location shift is not equal to 0
# for data stored separately for each group
wilcox.test(x = score.A, y = score.B)
## 
##  Wilcoxon rank sum exact test
## 
## data:  score.A and score.B
## W = 3, p-value = 0.05556
## alternative hypothesis: true location shift is not equal to 0
# running the one sample Wilcoxon test
wilcox.test(x = happiness$change, mu = 0)
## 
##  Wilcoxon signed rank exact test
## 
## data:  happiness$change
## V = 7, p-value = 0.03711
## alternative hypothesis: true location is not equal to 0
# or equivalently as a paired two-sample test; TRUE spelled out
# (T is a reassignable binding, not a reserved word)
wilcox.test(x = happiness$after, y = happiness$before, paired = TRUE)
## 
##  Wilcoxon signed rank exact test
## 
## data:  happiness$after and happiness$before
## V = 7, p-value = 0.03711
## alternative hypothesis: true location shift is not equal to 0

5.5 Comparing several means (one-way ANOVA)

# load data
load("input/clinicaltrial.Rdata")
clin.trial
##        drug    therapy mood.gain
## 1   placebo no.therapy       0.5
## 2   placebo no.therapy       0.3
## 3   placebo no.therapy       0.1
## 4  anxifree no.therapy       0.6
## 5  anxifree no.therapy       0.4
## 6  anxifree no.therapy       0.2
## 7  joyzepam no.therapy       1.4
## 8  joyzepam no.therapy       1.7
## 9  joyzepam no.therapy       1.3
## 10  placebo        CBT       0.6
## 11  placebo        CBT       0.9
## 12  placebo        CBT       0.3
## 13 anxifree        CBT       1.1
## 14 anxifree        CBT       0.8
## 15 anxifree        CBT       1.2
## 16 joyzepam        CBT       1.8
## 17 joyzepam        CBT       1.3
## 18 joyzepam        CBT       1.4
# descriptive statistics
# how many people in each group
xtabs(~drug, clin.trial)
## drug
##  placebo anxifree joyzepam 
##        6        6        6
# calculating per-drug means and standard deviations of mood.gain;
# pipe continuation lines indented per the tidyverse style guide
mood.gain.mean <- aggregate(mood.gain ~ drug, clin.trial, mean) |>
  dplyr::rename(mood.gain.mean = mood.gain)

mood.gain.sd <- aggregate(mood.gain ~ drug, clin.trial, sd) |>
  dplyr::rename(mood.gain.sd = mood.gain)

mood.gain.mean
##       drug mood.gain.mean
## 1  placebo      0.4500000
## 2 anxifree      0.7166667
## 3 joyzepam      1.4833333
mood.gain.sd
##       drug mood.gain.sd
## 1  placebo    0.2810694
## 2 anxifree    0.3920034
## 3 joyzepam    0.2136976
# plotting the group means with confidence intervals;
# FALSE spelled out (F is a reassignable binding -- and is in fact
# shadowed by a numeric variable later in this very script)
gplots::plotmeans(
  formula = mood.gain ~ drug,
  data = clin.trial,
  xlab = "Drug Administered",
  ylab = "Mood Gain",
  n.label = FALSE
)

# calculating the within-group sum of squares (SSw) by hand
outcome <- clin.trial$mood.gain
group <- clin.trial$drug
# per-group means of the outcome
gp.means <- tapply(outcome, group, mean)
# expand to one entry per observation by indexing with the group factor
gp.means <- gp.means[group]
# each observation's deviation from its own group mean
dev.from.gp.means <- outcome - gp.means
squared.devs <- dev.from.gp.means^2
# assemble the intermediate quantities for display
Y <- data.frame(group, outcome, gp.means, dev.from.gp.means, squared.devs)
Y
##       group outcome  gp.means dev.from.gp.means squared.devs
## 1   placebo     0.5 0.4500000        0.05000000  0.002500000
## 2   placebo     0.3 0.4500000       -0.15000000  0.022500000
## 3   placebo     0.1 0.4500000       -0.35000000  0.122500000
## 4  anxifree     0.6 0.7166667       -0.11666667  0.013611111
## 5  anxifree     0.4 0.7166667       -0.31666667  0.100277778
## 6  anxifree     0.2 0.7166667       -0.51666667  0.266944444
## 7  joyzepam     1.4 1.4833333       -0.08333333  0.006944444
## 8  joyzepam     1.7 1.4833333        0.21666667  0.046944444
## 9  joyzepam     1.3 1.4833333       -0.18333333  0.033611111
## 10  placebo     0.6 0.4500000        0.15000000  0.022500000
## 11  placebo     0.9 0.4500000        0.45000000  0.202500000
## 12  placebo     0.3 0.4500000       -0.15000000  0.022500000
## 13 anxifree     1.1 0.7166667        0.38333333  0.146944444
## 14 anxifree     0.8 0.7166667        0.08333333  0.006944444
## 15 anxifree     1.2 0.7166667        0.48333333  0.233611111
## 16 joyzepam     1.8 1.4833333        0.31666667  0.100277778
## 17 joyzepam     1.3 1.4833333       -0.18333333  0.033611111
## 18 joyzepam     1.4 1.4833333       -0.08333333  0.006944444
SSw <- sum(squared.devs)
SSw
## [1] 1.391667
# calculating the between-group sum of squares (SSb) by hand
# (note: gp.means and squared.devs are deliberately reused/overwritten
# from the within-group calculation above)
gp.means <- tapply(outcome, group, mean)
grand.mean <- mean(outcome)
# each group mean's deviation from the grand mean
dev.from.grand.mean <- gp.means - grand.mean
squared.devs <- dev.from.grand.mean^2
# weight each squared deviation by its group size
gp.sizes <- tapply(outcome, group, length)
wt.squared.devs <- gp.sizes * squared.devs
# assemble the intermediate quantities for display
Y <- data.frame(gp.means, grand.mean, dev.from.grand.mean, squared.devs, gp.sizes, wt.squared.devs)
Y
##           gp.means grand.mean dev.from.grand.mean squared.devs gp.sizes
## placebo  0.4500000  0.8833333          -0.4333333   0.18777778        6
## anxifree 0.7166667  0.8833333          -0.1666667   0.02777778        6
## joyzepam 1.4833333  0.8833333           0.6000000   0.36000000        6
##          wt.squared.devs
## placebo        1.1266667
## anxifree       0.1666667
## joyzepam       2.1600000
SSb <- sum(wt.squared.devs)
SSb
## [1] 3.453333
# calculating df (G = 3 groups, N = 18 observations:
# between df = G - 1, within df = N - G)
df_b <- 3 - 1
df_w <- 18 - 3

# calculating the mean square values (using the rounded SS values 3.45
# and 1.39 quoted in the text, hence the tiny difference from aov() below)
MS_b <- 3.45 / df_b
MS_w <- 1.39 / df_w

# calculating F-value
# BUG FIX: the original assigned this to `F`, which masks R's shorthand
# for FALSE; `pf(..., lower.tail = F)` then received the numeric
# F-statistic, coerced it to TRUE, and reported 1 - p (0.9999136)
# instead of the p-value. Never use F (or T) as a variable name.
f.stat <- MS_b / MS_w
f.stat
## [1] 18.61511
# calculating the p-value: upper-tail probability P(F >= observed),
# in agreement with the aov() summary below (p = 0.0000865)
pf(f.stat, df1 = 2, df2 = 15, lower.tail = FALSE)
## [1] 0.00008637
# calculating ANOVA with aov()
my.anova <- aov(mood.gain ~ drug, clin.trial)
my.anova
## Call:
##    aov(formula = mood.gain ~ drug, data = clin.trial)
## 
## Terms:
##                     drug Residuals
## Sum of Squares  3.453333  1.391667
## Deg. of Freedom        2        15
## 
## Residual standard error: 0.3045944
## Estimated effects may be unbalanced
summary(my.anova)
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## drug         2  3.453  1.7267   18.61 0.0000865 ***
## Residuals   15  1.392  0.0928                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# calculating the effect size (eta squared) by hand:
# proportion of the total variability attributable to the group factor
SStot <- SSb + SSw
eta.squared <- SSb / SStot
eta.squared
## [1] 0.7127623
# or compute it directly from the fitted aov object; namespace-qualified
# for consistency with the other lsr:: calls in this report
lsr::etaSquared(x = my.anova)
##         eta.sq eta.sq.part
## drug 0.7127623   0.7127623

5.5.1 Multiple comparisons and post hoc tests

# the tidy way: filter each drug's rows, keep mood.gain, then pull the
# column out as a plain numeric vector; pipe continuations indented and
# filter namespace-qualified for consistency with dplyr::select
placebo <- clin.trial |>
  dplyr::filter(drug == "placebo") |>
  dplyr::select(mood.gain)
placebo <- placebo[, 1]

anxifree <- clin.trial |>
  dplyr::filter(drug == "anxifree") |>
  dplyr::select(mood.gain)
anxifree <- anxifree[, 1]

joyzepam <- clin.trial |>
  dplyr::filter(drug == "joyzepam") |>
  dplyr::select(mood.gain)
joyzepam <- joyzepam[, 1]

# the non-tidy way (note: these overwrite the three vectors built above;
# both approaches produce the same numeric vectors)
joyzepam <- with(clin.trial, mood.gain[drug == "joyzepam"]) # mood change due to joyzepam
anxifree <- with(clin.trial, mood.gain[drug == "anxifree"]) # mood change due to anxifree
placebo <- with(clin.trial, mood.gain[drug == "placebo"]) # mood change due to placebo

# running "pairwise" t-tests
t.test(anxifree, placebo, var.equal = TRUE) # Student t-test
## 
##  Two Sample t-test
## 
## data:  anxifree and placebo
## t = 1.3542, df = 10, p-value = 0.2055
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1721001  0.7054334
## sample estimates:
## mean of x mean of y 
## 0.7166667 0.4500000
# or
t.test(
  formula = mood.gain ~ drug,
  data = clin.trial,
  subset = drug %in% c("placebo", "anxifree"),
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  mood.gain by drug
## t = -1.3542, df = 10, p-value = 0.2055
## alternative hypothesis: true difference in means between group placebo and group anxifree is not equal to 0
## 95 percent confidence interval:
##  -0.7054334  0.1721001
## sample estimates:
##  mean in group placebo mean in group anxifree 
##              0.4500000              0.7166667
# or
pairwise.t.test(
  x = clin.trial$mood.gain, # outcome variable
  g = clin.trial$drug, # grouping variable
  p.adjust.method = "none" # which correction to use
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  clin.trial$mood.gain and clin.trial$drug 
## 
##          placebo anxifree
## anxifree 0.15021 -       
## joyzepam 0.00003 0.00056 
## 
## P value adjustment method: none
# or
lsr::posthocPairwiseT(x = my.anova, p.adjust.method = "none") # calling pairwise.t.test using an aov object
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mood.gain and drug 
## 
##          placebo anxifree
## anxifree 0.15021 -       
## joyzepam 0.00003 0.00056 
## 
## P value adjustment method: none
# corrections for multiple testing
# Bonferroni corrections
lsr::posthocPairwiseT(my.anova, p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mood.gain and drug 
## 
##          placebo  anxifree
## anxifree 0.4506   -       
## joyzepam 0.000091 0.0017  
## 
## P value adjustment method: bonferroni
# Holm correction
lsr::posthocPairwiseT(my.anova)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mood.gain and drug 
## 
##          placebo  anxifree
## anxifree 0.1502   -       
## joyzepam 0.000091 0.0011  
## 
## P value adjustment method: holm

5.5.2 Checking the homogeneity of variance assumption

# running the Levene's test
car::leveneTest(y = my.anova, center = mean) # mean
## Levene's Test for Homogeneity of Variance (center = mean)
##       Df F value Pr(>F)
## group  2  1.4497 0.2657
##       15
# or
car::leveneTest(y = mood.gain ~ drug, data = clin.trial) # y is a formula in this case
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.4672 0.2618
##       15
# or
car::leveneTest(y = clin.trial$mood.gain, group = clin.trial$drug) # y is the outcome
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.4672 0.2618
##       15
# running the Brown-Forsythe test (default)
car::leveneTest(my.anova) # median
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.4672 0.2618
##       15

5.5.3 Removing the homogeneity of variance assumption

# running the Welch one-way test
oneway.test(mood.gain ~ drug, data = clin.trial)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  mood.gain and drug
## F = 26.322, num df = 2.0000, denom df = 9.4932, p-value = 0.000134

5.5.4 Checking the normality assumption

# extract the residuals
my.anova.residuals <- residuals(object = my.anova)
my.anova.residuals
##           1           2           3           4           5           6 
##  0.05000000 -0.15000000 -0.35000000 -0.11666667 -0.31666667 -0.51666667 
##           7           8           9          10          11          12 
## -0.08333333  0.21666667 -0.18333333  0.15000000  0.45000000 -0.15000000 
##          13          14          15          16          17          18 
##  0.38333333  0.08333333  0.48333333  0.31666667 -0.18333333 -0.08333333
# plotting the results to check normality
hist(x = my.anova.residuals)

qqnorm(y = my.anova.residuals)

shapiro.test(x = my.anova.residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  my.anova.residuals
## W = 0.96019, p-value = 0.6053

5.5.5 Removing the normality assumption

# running the Kruskal-Wallis test
kruskal.test(mood.gain ~ drug, data = clin.trial)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mood.gain by drug
## Kruskal-Wallis chi-squared = 12.076, df = 2, p-value = 0.002386
# or
kruskal.test(x = clin.trial$mood.gain, g = clin.trial$drug)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  clin.trial$mood.gain and clin.trial$drug
## Kruskal-Wallis chi-squared = 12.076, df = 2, p-value = 0.002386
# or
mood.gain <- list(placebo, joyzepam, anxifree)
kruskal.test(x = mood.gain)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mood.gain
## Kruskal-Wallis chi-squared = 12.076, df = 2, p-value = 0.002386

5.6 Linear regression

# parameter to function drawBasicScatterplot: a mid-grey point colour
emphGrey <- grey(.5)

# plotting scatterplots (directly)
# plot(x = parenthood$dan.sleep, y = parenthood$dan.grump,
#      xlab = "Dan Sleep (hours)",
#      ylab = "Dan Grumpiness (0-100)")
# plot(x = parenthood$baby.sleep, y = parenthood$dan.grump)

# function to plot scatterplots of dan.grump against dan.sleep
# NOTE: reads `parenthood` from the enclosing environment rather than
# taking it as an argument, so it must be defined before this is called
drawBasicScatterplot <- function(dotcol, title) {
  plot(parenthood$dan.sleep,
    parenthood$dan.grump,
    xlab = "Dan Sleep (hours)",
    ylab = "Dan Grumpiness (0-100)",
    col = dotcol,       # point colour
    main = title,       # plot title
    font.main = 1,      # plain (non-bold) title font
    pch = 19            # solid filled circles
  )
}

# basic scatterplot with no title
drawBasicScatterplot(emphGrey, "")

# plotting a depiction of the residuals associated with the best fitting regression line
# good regression line: the least-squares fit from lm()
drawBasicScatterplot(emphGrey, "Regression Line Close to the Data")
good.coef <- lm(dan.grump ~ dan.sleep, parenthood)$coef
# NOTE(review): `colour` and `emphCol` are not defined in this chunk --
# presumably set earlier in the full script; verify before running in
# isolation. Also, ifelse() on the scalar `colour` would be more
# idiomatically written as if/else.
abline(good.coef, col = ifelse(colour, emphCol, "black"), lwd = 3)
# draw a vertical segment from each observed point to the fitted line
for (i in seq_along(parenthood$dan.sleep)) {
  xval <- parenthood$dan.sleep[i] * c(1, 1)
  yval <- c(parenthood$dan.grump[i], good.coef[1] + good.coef[2] * parenthood$dan.sleep[i])
  lines(xval, yval, type = "l", col = emphGrey)
}

# bad regression line: arbitrary coefficients far from the data,
# shown for contrast with the least-squares fit above
drawBasicScatterplot(emphGrey, "Regression Line Distant from the Data")
bad.coef <- c(80, -3)
abline(bad.coef, col = ifelse(colour, emphCol, "black"), lwd = 3)
# same residual segments, now much longer
for (i in seq_along(parenthood$dan.sleep)) {
  xval <- parenthood$dan.sleep[i] * c(1, 1)
  yval <- c(parenthood$dan.grump[i], bad.coef[1] + bad.coef[2] * parenthood$dan.sleep[i])
  lines(xval, yval, type = "l", col = emphGrey)
}

# running a simple regression model
regression.1 <- lm(dan.grump ~ dan.sleep, data = parenthood)
regression.1
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# running a multiple linear regression
regression.2 <- lm(dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
regression.2
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##   125.96557     -8.95025      0.01052

5.6.1 Quantifying the fit of the regression model

# The R-squared value, computed by hand
X <- parenthood$dan.sleep # the predictor
Y <- parenthood$dan.grump # the outcome
# predictions using the regression.1 coefficients rounded to 2 dp
# (-8.937 -> -8.94, 125.956 -> 125.97); this rounding is why the manual
# R-squared below differs slightly from cor(X, Y)^2
Y.pred <- -8.94 * X + 125.97

# calculating the sum of squared residuals
SS.resid <- sum((Y - Y.pred)^2)
SS.resid
## [1] 1838.722
# calculating the total sum of squares
SS.tot <- sum((Y - mean(Y))^2)
SS.tot
## [1] 9998.59
# calculating the coefficient of determination or R-squared
R.squared <- 1 - (SS.resid / SS.tot)
R.squared
## [1] 0.8161018
# relation between regression and correlation
# calculation correlation (r-Pearson correlation coefficient)
r <- cor(X, Y)
r^2
## [1] 0.8161027

5.6.2 Hypothesis test for regression models

# compute all the quantities of the regression model
summary(regression.2)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0345  -2.2198  -0.4016   2.6775  11.7496 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 125.96557    3.04095  41.423 <0.0000000000000002 ***
## dan.sleep    -8.95025    0.55346 -16.172 <0.0000000000000002 ***
## baby.sleep    0.01052    0.27106   0.039               0.969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.354 on 97 degrees of freedom
## Multiple R-squared:  0.8161, Adjusted R-squared:  0.8123 
## F-statistic: 215.2 on 2 and 97 DF,  p-value: < 0.00000000000000022

5.6.3 Hypothesis test for a single correlation

# running t-test on a coefficient in a regression model
summary(regression.1)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.025  -2.213  -0.399   2.681  11.750 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 125.9563     3.0161   41.76 <0.0000000000000002 ***
## dan.sleep    -8.9368     0.4285  -20.85 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.332 on 98 degrees of freedom
## Multiple R-squared:  0.8161, Adjusted R-squared:  0.8142 
## F-statistic: 434.9 on 1 and 98 DF,  p-value: < 0.00000000000000022
# is the same
cor.test(x = parenthood$dan.sleep, y = parenthood$dan.grump)
## 
##  Pearson's product-moment correlation
## 
## data:  parenthood$dan.sleep and parenthood$dan.grump
## t = -20.854, df = 98, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9340614 -0.8594714
## sample estimates:
##       cor 
## -0.903384

5.6.4 Regarding regression coefficients

# confidence intervals for the coefficients
confint(object = regression.2, level = .99)
##                   0.5 %      99.5 %
## (Intercept) 117.9755724 133.9555593
## dan.sleep   -10.4044419  -7.4960575
## baby.sleep   -0.7016868   0.7227357
# calculating standardised regression coefficients
lsr::standardCoefs(regression.2)
##                      b        beta
## dan.sleep  -8.95024973 -0.90474809
## baby.sleep  0.01052447  0.00217223

5.6.5 Model checking

# Residuals
# getting ordinary residuals
residuals(object = regression.2)
##           1           2           3           4           5           6 
##  -2.1403095   4.7081942   1.9553640  -2.0602806   0.7194888  -0.4066133 
##           7           8           9          10          11          12 
##   0.2269987  -1.7003077   0.2025039   3.8524589   3.9986291  -4.9120150 
##          13          14          15          16          17          18 
##   1.2060134   0.4946578  -2.6579276  -0.3966805   3.3538613   1.7261225 
##          19          20          21          22          23          24 
##  -0.4922551  -5.6405941  -0.4660764   2.7238389   9.3653697   0.2841513 
##          25          26          27          28          29          30 
##  -0.5037668  -1.4941146   8.1328623   1.9787316  -1.5126726   3.5171148 
##          31          32          33          34          35          36 
##  -8.9256951  -2.8282946   6.1030349  -7.5460717   4.5572128 -10.6510836 
##          37          38          39          40          41          42 
##  -5.6931846   6.3096506  -2.1082466  -0.5044253   0.1875576   4.8094841 
##          43          44          45          46          47          48 
##  -5.4135163  -6.2292842  -4.5725232  -5.3354601   3.9950111   2.1718745 
##          49          50          51          52          53          54 
##  -3.4766440   0.4834367   6.2839790   2.0109396  -1.5846631  -2.2166613 
##          55          56          57          58          59          60 
##   2.2033140   1.9328736  -1.8301204  -1.5401430   2.5298509  -3.3705782 
##          61          62          63          64          65          66 
##  -2.9380806   0.6590736  -0.5917559  -8.6131971   5.9781035   5.9332979 
##          67          68          69          70          71          72 
##  -1.2341956   3.0047669  -1.0802468   6.5174672  -3.0155469   2.1176720 
##          73          74          75          76          77          78 
##   0.6058757  -2.7237421  -2.2291472  -1.4053822   4.7461491  11.7495569 
##          79          80          81          82          83          84 
##   4.7634141   2.6620908 -11.0345292  -0.7588667   1.4558227  -0.4745727 
##          85          86          87          88          89          90 
##   8.9091201  -1.1409777   0.7555223  -0.4107130   0.8797237  -1.4095586 
##          91          92          93          94          95          96 
##   3.1571385  -3.4205757  -5.7228699  -2.2033958  -3.8647891   0.4982711 
##          97          98          99         100 
##  -5.5249495   4.1134221  -8.2038533   5.6800859
# getting standardised residuals
rstandard(model = regression.2)
##           1           2           3           4           5           6 
## -0.49675845  1.10430571  0.46361264 -0.47725357  0.16756281 -0.09488969 
##           7           8           9          10          11          12 
##  0.05286626 -0.39260381  0.04739691  0.89033990  0.95851248 -1.13898701 
##          13          14          15          16          17          18 
##  0.28047841  0.11519184 -0.61657092 -0.09191865  0.77692937  0.40403495 
##          19          20          21          22          23          24 
## -0.11552373 -1.31540412 -0.10819238  0.62951824  2.17129803  0.06586227 
##          25          26          27          28          29          30 
## -0.11980449 -0.34704024  1.91121833  0.45686516 -0.34986350  0.81233165 
##          31          32          33          34          35          36 
## -2.08659993 -0.66317843  1.42930082 -1.77763064  1.07452436 -2.47385780 
##          37          38          39          40          41          42 
## -1.32715114  1.49419658 -0.49115639 -0.11674947  0.04401233  1.11881912 
##          43          44          45          46          47          48 
## -1.27081641 -1.46422595 -1.06943700 -1.24659673  0.94152881  0.51069809 
##          49          50          51          52          53          54 
## -0.81373349  0.11412178  1.47938594  0.46437962 -0.37157009 -0.51609949 
##          55          56          57          58          59          60 
##  0.51800753  0.44813204 -0.42662358 -0.35575611  0.58403297 -0.78022677 
##          61          62          63          64          65          66 
## -0.67833325  0.15484699 -0.13760574 -2.05662232  1.40238029  1.37505125 
##          67          68          69          70          71          72 
## -0.28964989  0.69497632 -0.24945316  1.50709623 -0.69864682  0.49071427 
##          73          74          75          76          77          78 
##  0.14267297 -0.63246560 -0.51972828 -0.32509811  1.10842574  2.72171671 
##          79          80          81          82          83          84 
##  1.09975101  0.62057080 -2.55172097 -0.17584803  0.34340064 -0.11158952 
##          85          86          87          88          89          90 
##  2.10863391 -0.26386516  0.17624445 -0.09504416  0.20450884 -0.32730740 
##          91          92          93          94          95          96 
##  0.73475640 -0.79400855 -1.32768248 -0.51940736 -0.91512580  0.11661226 
##          97          98          99         100 
## -1.28069115  0.96332849 -1.90290258  1.31368144
# getting Studentised residuals
rstudent(model = regression.2)
##           1           2           3           4           5           6 
## -0.49482102  1.10557030  0.46172854 -0.47534555  0.16672097 -0.09440368 
##           7           8           9          10          11          12 
##  0.05259381 -0.39088553  0.04715251  0.88938019  0.95810710 -1.14075472 
##          13          14          15          16          17          18 
##  0.27914212  0.11460437 -0.61459001 -0.09144760  0.77533036  0.40228555 
##          19          20          21          22          23          24 
## -0.11493461 -1.32043609 -0.10763974  0.62754813  2.21456485  0.06552336 
##          25          26          27          28          29          30 
## -0.11919416 -0.34546127  1.93818473  0.45499388 -0.34827522  0.81089646 
##          31          32          33          34          35          36 
## -2.12403286 -0.66125192  1.43712830 -1.79797263  1.07539064 -2.54258876 
##          37          38          39          40          41          42 
## -1.33244515  1.50388257 -0.48922682 -0.11615428  0.04378531  1.12028904 
##          43          44          45          46          47          48 
## -1.27490649 -1.47302872 -1.07023828 -1.25020935  0.94097261  0.50874322 
##          49          50          51          52          53          54 
## -0.81230544  0.11353962  1.48863006  0.46249410 -0.36991317 -0.51413868 
##          55          56          57          58          59          60 
##  0.51604474  0.44627831 -0.42481754 -0.35414868  0.58203894 -0.77864171 
##          61          62          63          64          65          66 
## -0.67643392  0.15406579 -0.13690795 -2.09211556  1.40949469  1.38147541 
##          67          68          69          70          71          72 
## -0.28827768  0.69311245 -0.24824363  1.51717578 -0.69679156  0.48878534 
##          73          74          75          76          77          78 
##  0.14195054 -0.63049841 -0.51776374 -0.32359434  1.10974786  2.81736616 
##          79          80          81          82          83          84 
##  1.10095270  0.61859288 -2.62827967 -0.17496714  0.34183379 -0.11101996 
##          85          86          87          88          89          90 
##  2.14753375 -0.26259576  0.17536170 -0.09455738  0.20349582 -0.32579584 
##          91          92          93          94          95          96 
##  0.73300184 -0.79248469 -1.33298848 -0.51744314 -0.91435205  0.11601774 
##          97          98          99         100 
## -1.28498273  0.96296745 -1.92942389  1.31867548
# Anomalous data
# getting hat values (the leverage of each observation on the fitted model)
hatvalues(model = regression.2)
##          1          2          3          4          5          6          7 
## 0.02067452 0.04105320 0.06155445 0.01685226 0.02734865 0.03129943 0.02735579 
##          8          9         10         11         12         13         14 
## 0.01051224 0.03698976 0.01229155 0.08189763 0.01882551 0.02462902 0.02718388 
##         15         16         17         18         19         20         21 
## 0.01964210 0.01748592 0.01691392 0.03712530 0.04213891 0.02994643 0.02099435 
##         22         23         24         25         26         27         28 
## 0.01233280 0.01853370 0.01804801 0.06722392 0.02214927 0.04472007 0.01039447 
##         29         30         31         32         33         34         35 
## 0.01381812 0.01105817 0.03468260 0.04048248 0.03814670 0.04934440 0.05107803 
##         36         37         38         39         40         41         42 
## 0.02208177 0.02919013 0.05928178 0.02799695 0.01519967 0.04195751 0.02514137 
##         43         44         45         46         47         48         49 
## 0.04267879 0.04517340 0.03558080 0.03360160 0.05019778 0.04587468 0.03701290 
##         50         51         52         53         54         55         56 
## 0.05331282 0.04814477 0.01072699 0.04047386 0.02681315 0.04556787 0.01856997 
##         57         58         59         60         61         62         63 
## 0.02919045 0.01126069 0.01012683 0.01546412 0.01029534 0.04428870 0.02438944 
##         64         65         66         67         68         69         70 
## 0.07469673 0.04135090 0.01775697 0.04217616 0.01384321 0.01069005 0.01340216 
##         71         72         73         74         75         76         77 
## 0.01716361 0.01751844 0.04863314 0.02158623 0.02951418 0.01411915 0.03276064 
##         78         79         80         81         82         83         84 
## 0.01684599 0.01028001 0.02920514 0.01348051 0.01752758 0.05184527 0.04583604 
##         85         86         87         88         89         90         91 
## 0.05825858 0.01359644 0.03054414 0.01487724 0.02381348 0.02159418 0.02598661 
##         92         93         94         95         96         97         98 
## 0.02093288 0.01982480 0.05063492 0.05907629 0.03682026 0.01817919 0.03811718 
##         99        100 
## 0.01945603 0.01373394
# getting Cook's distance values (overall influence of each observation,
# combining residual size with leverage)
cooks.distance(model = regression.2)
##             1             2             3             4             5 
## 0.00173651171 0.01740242936 0.00469937006 0.00130141703 0.00026315569 
##             6             7             8             9            10 
## 0.00009697585 0.00002620181 0.00054584906 0.00002876269 0.00328827681 
##            11            12            13            14            15 
## 0.02731835260 0.00829691897 0.00066214792 0.00012359557 0.00253891456 
##            16            17            18            19            20 
## 0.00005012283 0.00346174151 0.00209805468 0.00019570497 0.01780518526 
##            21            22            23            24            25 
## 0.00008367377 0.00164947783 0.02967593755 0.00002657610 0.00034480325 
##            26            27            28            29            30 
## 0.00090933792 0.05699951226 0.00073079434 0.00057169976 0.00245956350 
##            31            32            33            34            35 
## 0.05214331478 0.00618519973 0.02700686248 0.05467344690 0.02071643055 
##            36            37            38            39            40 
## 0.04606378152 0.01765311655 0.04689816875 0.00231612212 0.00007012530 
##            41            42            43            44            45 
## 0.00002827824 0.01076083119 0.02399930996 0.03381062226 0.01406497804 
##            46            47            48            49            50 
## 0.01801086321 0.01561698982 0.00417998637 0.00848351357 0.00024447866 
##            51            52            53            54            55 
## 0.03689945760 0.00077944724 0.00194123453 0.00244622954 0.00427036054 
##            56            57            58            59            60 
## 0.00126660900 0.00182421163 0.00048047048 0.00116318135 0.00318723481 
##            61            62            63            64            65 
## 0.00159551161 0.00037038258 0.00015778917 0.11381653127 0.02827715166 
##            66            67            68            69            70 
## 0.01139374043 0.00123142189 0.00226000648 0.00022413221 0.01028478940 
##            71            72            73            74            75 
## 0.00284132858 0.00143122276 0.00034685379 0.00294175691 0.00273824884 
##            76            77            78            79            80 
## 0.00050453567 0.01387108351 0.04230965547 0.00418744027 0.00386183094 
##            81            82            83            84            85 
## 0.02965826018 0.00018388884 0.00214936854 0.00019939290 0.09168733246 
##            86            87            88            89            90 
## 0.00031989944 0.00032621922 0.00004547383 0.00034008931 0.00078814872 
##            91            92            93            94            95 
## 0.00480120376 0.00449309454 0.01188426604 0.00479636047 0.01752665888 
##            96            97            98            99           100 
## 0.00017327933 0.01012301719 0.01225818302 0.02394963622 0.00801050779
# outlierTest() from the car package: Bonferroni-adjusted test of whether the
# largest Studentised residual is significantly larger than expected
car::outlierTest(model = regression.2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferroni p
## 78 2.817366          0.0058788      0.58788
# plotting anomalous data using plot()
plot(x = regression.2, which = 4) # Cook's distance

plot(x = regression.2, which = 5) # the Studentised residuals against leverage

# the same influence diagnostics using the car package
car::influenceIndexPlot(model = regression.2)

car::influencePlot(model = regression.2)

##       StudRes        Hat      CookD
## 11  0.9581071 0.08189763 0.02731835
## 64 -2.0921156 0.07469673 0.11381653
## 78  2.8173662 0.01684599 0.04230966
## 81 -2.6282797 0.01348051 0.02965826
## 85  2.1475338 0.05825858 0.09168733
# getting rid of outliers: re-fit the model excluding observation 64
# via the subset argument (negative index drops that row)
lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood, subset = -64)
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood, 
##     subset = -64)
## 
## Coefficients:
## (Intercept)    dan.sleep   baby.sleep  
##    126.3553      -8.8283      -0.1319
# Checking the normality of the residuals
# plotting a histogram of the residuals
hist(x = residuals(regression.2),
     xlab = "Value of residual",
     main = "",
     breaks = 20)

# running a Shapiro-Wilk normality test on the residuals
shapiro.test(x = residuals(regression.2))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(regression.2)
## W = 0.99228, p-value = 0.8414
# plotting qq-plot
plot(x = regression.2, which = 2)

# Checking the linearity of the relationship
# plotting the relationship between the fitted values and the observed values
yhat.2 <- fitted.values(object = regression.2)
plot(x = yhat.2,
     y = parenthood$dan.grump,
     xlab = "Fitted Values",
     ylab = "Observed Values")

# plotting the relationship between the fitted values and the residuals themselves
plot(x = regression.2, which = 1)

# with the residualPlots() function from the car package (adds curvature tests)
car::residualPlots(model = regression.2)

##            Test stat Pr(>|Test stat|)  
## dan.sleep     2.1604          0.03323 *
## baby.sleep   -0.5445          0.58733  
## Tukey test    2.1615          0.03066 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Checking the homogeneity of variance (scale-location plot)
plot(x = regression.2, which = 3)

# running the non-constant variance score test
car::ncvTest(regression.2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.09317511, Df = 1, p = 0.76018
# running sandwich (heteroscedasticity-consistent) standard errors if the
# homogeneity of variance assumption is violated; hccm() is car's
# heteroscedasticity-corrected covariance matrix
lmtest::coeftest(regression.2, vcov. = hccm)
## 
## t test of coefficients:
## 
##               Estimate Std. Error  t value            Pr(>|t|)    
## (Intercept) 125.965566   3.247285  38.7910 <0.0000000000000002 ***
## dan.sleep    -8.950250   0.615820 -14.5339 <0.0000000000000002 ***
## baby.sleep    0.010524   0.291565   0.0361              0.9713    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Checking for collinearity
# calculating the variance inflation factors (VIFs) for each predictor
car::vif(mod = regression.2)
##  dan.sleep baby.sleep 
##   1.651038   1.651038
# example of getting collinearity (note the much larger VIFs in the output)
regression.3 <- lm(day ~ baby.sleep + dan.sleep + dan.grump, parenthood)
car::vif(mod = regression.3)
## baby.sleep  dan.sleep  dan.grump 
##   1.651064   6.102337   5.437903

5.6.6 Model selection

# the backward elimination approach
# 1. get the complete regression model (all three candidate predictors)
full.model <- lm(formula = dan.grump ~ dan.sleep + baby.sleep + day, data = parenthood)

# 2. run the step() function backward (drops terms one at a time while AIC improves)
step(object = full.model, direction = "backward")
## Start:  AIC=299.08
## dan.grump ~ dan.sleep + baby.sleep + day
## 
##              Df Sum of Sq    RSS    AIC
## - baby.sleep  1       0.1 1837.2 297.08
## - day         1       1.6 1838.7 297.16
## <none>                    1837.1 299.08
## - dan.sleep   1    4909.0 6746.1 427.15
## 
## Step:  AIC=297.08
## dan.grump ~ dan.sleep + day
## 
##             Df Sum of Sq    RSS    AIC
## - day        1       1.6 1838.7 295.17
## <none>                   1837.2 297.08
## - dan.sleep  1    8103.0 9940.1 463.92
## 
## Step:  AIC=295.17
## dan.grump ~ dan.sleep
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   1838.7 295.17
## - dan.sleep  1    8159.9 9998.6 462.50
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# running step() forward, starting from the intercept-only (null) model;
# scope gives the largest model that forward selection is allowed to reach
null.model <- lm(dan.grump ~ 1, parenthood)
step(object = null.model, direction = "forward", scope = dan.grump ~ dan.sleep + baby.sleep + day)
## Start:  AIC=462.5
## dan.grump ~ 1
## 
##              Df Sum of Sq    RSS    AIC
## + dan.sleep   1    8159.9 1838.7 295.17
## + baby.sleep  1    3202.7 6795.9 425.89
## <none>                    9998.6 462.50
## + day         1      58.5 9940.1 463.92
## 
## Step:  AIC=295.17
## dan.grump ~ dan.sleep
## 
##              Df Sum of Sq    RSS    AIC
## <none>                    1838.7 295.17
## + day         1   1.55760 1837.2 297.08
## + baby.sleep  1   0.02858 1838.7 297.16
## 
## Call:
## lm(formula = dan.grump ~ dan.sleep, data = parenthood)
## 
## Coefficients:
## (Intercept)    dan.sleep  
##     125.956       -8.937
# comparing two regression models
# 1. running the regressions (M1 adds baby.sleep on top of M0)
M0 <- lm(dan.grump ~ dan.sleep + day, parenthood)
M1 <- lm(dan.grump ~ dan.sleep + day + baby.sleep, parenthood)

# A. based on a model selection criterion (AIC; lower is better)
AIC(M0, M1)
##    df      AIC
## M0  4 582.8681
## M1  5 584.8646
# B. calculating the F statistic for the nested-model comparison
anova(M0, M1)
## Analysis of Variance Table
## 
## Model 1: dan.grump ~ dan.sleep + day
## Model 2: dan.grump ~ dan.sleep + day + baby.sleep
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     97 1837.2                           
## 2     96 1837.1  1  0.063688 0.0033 0.9541

5.7 Comparing several means with more than one grouping variable (factorial or two-way ANOVA)

5.7.1 Factorial ANOVA 1: balanced designs

# cross-tabulating variables
xtabs(~ drug + therapy, clin.trial)
##           therapy
## drug       no.therapy CBT
##   placebo           3   3
##   anxifree          3   3
##   joyzepam          3   3
# calculating a cross-tabulation of the group means for all possible combinations
# for factor drug
aggregate(mood.gain ~ drug, clin.trial, mean)
##       drug mood.gain
## 1  placebo 0.4500000
## 2 anxifree 0.7166667
## 3 joyzepam 1.4833333
# for factor therapy
aggregate(mood.gain ~ therapy, clin.trial, mean)
##      therapy mood.gain
## 1 no.therapy 0.7222222
## 2        CBT 1.0444444
# for both
total_means <- aggregate(mood.gain ~ drug + therapy, clin.trial, mean)
# for none (the mean)
mean(clin.trial$mood.gain)
## [1] 0.8833333
# running aov using only a single factor
model.1 <- aov(mood.gain ~ drug, clin.trial)
summary(model.1)
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## drug         2  3.453  1.7267   18.61 0.0000865 ***
## Residuals   15  1.392  0.0928                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# running aov using two grouping factors
model.2 <- aov(mood.gain ~ drug + therapy, clin.trial)
summary(model.2)
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## drug         2  3.453  1.7267  26.149 0.0000187 ***
## therapy      1  0.467  0.4672   7.076    0.0187 *  
## Residuals   14  0.924  0.0660                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plotting group means with confidence intervals to check for an interaction
# (ci.fun = ciMean, from the lsr package, draws mean confidence intervals)
sciplot::lineplot.CI(x.factor = clin.trial$drug,
            response = clin.trial$mood.gain,
            group = clin.trial$therapy,
            ci.fun = ciMean,
            xlab = "drug",
            ylab = "mood gain")

# running factorial ANOVA with interaction ('*' expands to main effects + interaction)
model.3 <- aov(mood.gain ~ drug * therapy, clin.trial)
# or, equivalently, spelling the terms out:
# model.3 <- aov(mood.gain ~ drug + therapy + drug:therapy, clin.trial)
summary(model.3)
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## drug          2  3.453  1.7267  31.714 0.0000162 ***
## therapy       1  0.467  0.4672   8.582    0.0126 *  
## drug:therapy  2  0.271  0.1356   2.490    0.1246    
## Residuals    12  0.653  0.0544                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# calculating effect size
lsr::etaSquared(model.2)
##            eta.sq eta.sq.part
## drug    0.7127623   0.7888325
## therapy 0.0964339   0.3357285
# calculating effect size with interaction
lsr::etaSquared(model.3)
##                  eta.sq eta.sq.part
## drug         0.71276230   0.8409091
## therapy      0.09643390   0.4169559
## drug:therapy 0.05595689   0.2932692
# calculating estimates of all group means
eff <- effects::effect(term = "drug*therapy", mod = model.3)
eff
## 
##  drug*therapy effect
##           therapy
## drug       no.therapy      CBT
##   placebo    0.300000 0.600000
##   anxifree   0.400000 1.033333
##   joyzepam   1.466667 1.500000
# extracting confidence intervals
summary(eff)
## 
##  drug*therapy effect
##           therapy
## drug       no.therapy      CBT
##   placebo    0.300000 0.600000
##   anxifree   0.400000 1.033333
##   joyzepam   1.466667 1.500000
## 
##  Lower 95 Percent Confidence Limits
##           therapy
## drug        no.therapy       CBT
##   placebo  0.006481093 0.3064811
##   anxifree 0.106481093 0.7398144
##   joyzepam 1.173147759 1.2064811
## 
##  Upper 95 Percent Confidence Limits
##           therapy
## drug       no.therapy       CBT
##   placebo   0.5935189 0.8935189
##   anxifree  0.6935189 1.3268522
##   joyzepam  1.7601856 1.7935189
# no interaction
eff2 <- effects::effect(term = "drug*therapy", mod = model.2)
eff2
## 
##  drug*therapy effect
##           therapy
## drug       no.therapy       CBT
##   placebo   0.2888889 0.6111111
##   anxifree  0.5555556 0.8777778
##   joyzepam  1.3222222 1.6444444
summary(eff2)
## 
##  drug*therapy effect
##           therapy
## drug       no.therapy       CBT
##   placebo   0.2888889 0.6111111
##   anxifree  0.5555556 0.8777778
##   joyzepam  1.3222222 1.6444444
## 
##  Lower 95 Percent Confidence Limits
##           therapy
## drug       no.therapy       CBT
##   placebo  0.02907986 0.3513021
##   anxifree 0.29574653 0.6179687
##   joyzepam 1.06241319 1.3846354
## 
##  Upper 95 Percent Confidence Limits
##           therapy
## drug       no.therapy       CBT
##   placebo   0.5486979 0.8709201
##   anxifree  0.8153646 1.1375868
##   joyzepam  1.5820313 1.9042535

5.7.1.1 Assumption checking

# Levene test for homogeneity of variance
car::leveneTest(mood.gain ~ drug * therapy, clin.trial)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  5  0.0955 0.9912
##       12
# testing the normality of the residuals
# 1. extracting the residuals
# NOTE(review): the name `resid` masks the base function stats::resid() for the
# rest of the session; harmless here, but a name like `model2.resid` would be safer
resid <- residuals(model.2)

# 2. plotting the residuals
hist(resid)

qqnorm(resid)

# 3. running the Shapiro-Wilk test
shapiro.test(resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid
## W = 0.95635, p-value = 0.5329
# running F-test for model comparison
anova(model.1, model.3)
## Analysis of Variance Table
## 
## Model 1: mood.gain ~ drug
## Model 2: mood.gain ~ drug * therapy
##   Res.Df     RSS Df Sum of Sq      F  Pr(>F)  
## 1     15 1.39167                              
## 2     12 0.65333  3   0.73833 4.5204 0.02424 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.7.1.2 ANOVA as linear model

# load data
load("input/rtfm.Rdata")
rtfm.1
##   grade attend reading
## 1    90      1       1
## 2    87      1       1
## 3    75      0       1
## 4    60      1       0
## 5    35      0       0
## 6    50      0       0
## 7    65      1       0
## 8    70      0       1
rtfm.2
##   grade attend reading
## 1    90    yes     yes
## 2    87    yes     yes
## 3    75     no     yes
## 4    60    yes      no
## 5    35     no      no
## 6    50     no      no
## 7    65    yes      no
## 8    70     no     yes
# running an ANOVA analysis
anova.model <- aov(grade ~ attend + reading, data = rtfm.2)
summary(anova.model)
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## attend       1    648     648   21.60 0.00559 ** 
## reading      1   1568    1568   52.27 0.00079 ***
## Residuals    5    150      30                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# taking notes of F statistics (the observed group means)
effects::Effect(c("attend", "reading"), anova.model)
## 
##  attend*reading effect
##       reading
## attend   no  yes
##    no  43.5 71.5
##    yes 61.5 89.5
# running a regression
regression.model <- lm(grade ~ attend + reading, data = rtfm.1)
coef(summary(regression.model))
##             Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)     43.5   3.354102 12.969194 0.00004858084
## attend          18.0   3.872983  4.647580 0.00559431256
## reading         28.0   3.872983  7.229569 0.00078994250
# running ANOVA function to a regression model
anova(regression.model)
## Analysis of Variance Table
## 
## Response: grade
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## attend     1    648     648  21.600 0.0055943 ** 
## reading    1   1568    1568  52.267 0.0007899 ***
## Residuals  5    150      30                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# encoding non-binary factors as binary indicator (dummy) variables by hand
clin.trial$druganxifree <- as.numeric(clin.trial$drug == "anxifree")
clin.trial$drugjoyzepam <- as.numeric(clin.trial$drug == "joyzepam")
# or using the expandFactors() function from lsr to expand every factor at once
clin.trial2 <- lsr::expandFactors(clin.trial)

# running the ANOVA model in a three-level factor data frame
drug.model <- aov(mood.gain ~ drug + therapy, data = clin.trial)
summary(drug.model)
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## drug         2  3.453  1.7267  26.149 0.0000187 ***
## therapy      1  0.467  0.4672   7.076    0.0187 *  
## Residuals   14  0.924  0.0660                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# running a regression model in a two binary contrasts data frame
drug.regression <- lm(mood.gain ~ druganxifree + drugjoyzepam + therapyCBT, data = clin.trial2)
summary(drug.regression)
## 
## Call:
## lm(formula = mood.gain ~ druganxifree + drugjoyzepam + therapyCBT, 
##     data = clin.trial2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3556 -0.1806  0.0000  0.1972  0.3778 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)    0.2889     0.1211   2.385    0.0318 *  
## druganxifree   0.2667     0.1484   1.797    0.0939 .  
## drugjoyzepam   1.0333     0.1484   6.965 0.0000066 ***
## therapyCBT     0.3222     0.1211   2.660    0.0187 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.257 on 14 degrees of freedom
## Multiple R-squared:  0.8092, Adjusted R-squared:  0.7683 
## F-statistic: 19.79 on 3 and 14 DF,  p-value: 0.0000264
# running the F-test
nodrug.regression <- lm(mood.gain ~ therapyCBT, data = clin.trial2)
anova(nodrug.regression, drug.regression)
## Analysis of Variance Table
## 
## Model 1: mood.gain ~ therapyCBT
## Model 2: mood.gain ~ druganxifree + drugjoyzepam + therapyCBT
##   Res.Df    RSS Df Sum of Sq      F     Pr(>F)    
## 1     16 4.3778                                   
## 2     14 0.9244  2    3.4533 26.149 0.00001872 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# running a regression model in a three-level factor data frame
drug.lm <- lm(mood.gain ~ drug + therapy, data = clin.trial)
summary(drug.lm)
## 
## Call:
## lm(formula = mood.gain ~ drug + therapy, data = clin.trial)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3556 -0.1806  0.0000  0.1972  0.3778 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)    0.2889     0.1211   2.385    0.0318 *  
## druganxifree   0.2667     0.1484   1.797    0.0939 .  
## drugjoyzepam   1.0333     0.1484   6.965 0.0000066 ***
## therapyCBT     0.3222     0.1211   2.660    0.0187 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.257 on 14 degrees of freedom
## Multiple R-squared:  0.8092, Adjusted R-squared:  0.7683 
## F-statistic: 19.79 on 3 and 14 DF,  p-value: 0.0000264
# using anova() gives us an ANOVA table
anova(drug.lm)
## Analysis of Variance Table
## 
## Response: mood.gain
##           Df Sum Sq Mean Sq F value     Pr(>F)    
## drug       2 3.4533 1.72667 26.1490 0.00001872 ***
## therapy    1 0.4672 0.46722  7.0757    0.01866 *  
## Residuals 14 0.9244 0.06603                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# creating treatment contrasts
contr.treatment(n = 5)
##   2 3 4 5
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 1 0
## 5 0 0 0 1
# last category treated as the baseline
contr.SAS(n = 5)
##   1 2 3 4
## 1 1 0 0 0
## 2 0 1 0 0
## 3 0 0 1 0
## 4 0 0 0 1
## 5 0 0 0 0
# or
contr.treatment(n = 5, base = 5)
##   1 2 3 4
## 1 1 0 0 0
## 2 0 1 0 0
## 3 0 0 1 0
## 4 0 0 0 1
## 5 0 0 0 0
# creating a contrast matrix with Helmert contrasts
contr.helmert(n = 5)
##   [,1] [,2] [,3] [,4]
## 1   -1   -1   -1   -1
## 2    1   -1   -1   -1
## 3    0    2   -1   -1
## 4    0    0    3   -1
## 5    0    0    0    4
# creating "sum to zero" matrix
contr.sum( n = 5)
##   [,1] [,2] [,3] [,4]
## 1    1    0    0    0
## 2    0    1    0    0
## 3    0    0    1    0
## 4    0    0    0    1
## 5   -1   -1   -1   -1
# checking contrasts matrix for a df
attr(clin.trial$drug, "contrasts")
## NULL
contrasts(clin.trial$drug) # per default: contr.treatment()
##          anxifree joyzepam
## placebo         0        0
## anxifree        1        0
## joyzepam        0        1
# setting the contrasts for a single factor
contrasts(clin.trial$drug) <- contr.sum(n = 3)
contrasts(clin.trial$drug)
##          [,1] [,2]
## placebo     1    0
## anxifree    0    1
## joyzepam   -1   -1
contrasts(clin.trial$drug) <- NULL # revert the defaults
contrasts(clin.trial$drug)
##          anxifree joyzepam
## placebo         0        0
## anxifree        1        0
## joyzepam        0        1
# setting the contrasts for a single analysis
# 1. creating a list with the contrasts types
my.contrasts <- list(drug = contr.helmert, therapy = contr.helmert)

# 2. fitting the ANOVA model with the contrasts argument specified
mod <- aov(mood.gain ~ drug * therapy, clin.trial, contrasts = my.contrasts)

# 3. checking
mod$contrasts
## $drug
##          [,1] [,2]
## placebo    -1   -1
## anxifree    1   -1
## joyzepam    0    2
## 
## $therapy
##            [,1]
## no.therapy   -1
## CBT           1

5.7.1.3 Post hoc tests

# running Tukey's HSD
TukeyHSD(model.2)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = mood.gain ~ drug + therapy, data = clin.trial)
## 
## $drug
##                        diff        lwr       upr     p adj
## anxifree-placebo  0.2666667 -0.1216321 0.6549655 0.2062942
## joyzepam-placebo  1.0333333  0.6450345 1.4216321 0.0000186
## joyzepam-anxifree 0.7666667  0.3783679 1.1549655 0.0003934
## 
## $therapy
##                     diff       lwr       upr     p adj
## CBT-no.therapy 0.3222222 0.0624132 0.5820312 0.0186602
# with interactions
TukeyHSD(model.3)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = mood.gain ~ drug * therapy, data = clin.trial)
## 
## $drug
##                        diff         lwr       upr     p adj
## anxifree-placebo  0.2666667 -0.09273475 0.6260681 0.1597148
## joyzepam-placebo  1.0333333  0.67393191 1.3927348 0.0000160
## joyzepam-anxifree 0.7666667  0.40726525 1.1260681 0.0002740
## 
## $therapy
##                     diff        lwr       upr    p adj
## CBT-no.therapy 0.3222222 0.08256504 0.5618794 0.012617
## 
## $`drug:therapy`
##                                                diff          lwr        upr
## anxifree:no.therapy-placebo:no.therapy   0.10000000 -0.539927728  0.7399277
## joyzepam:no.therapy-placebo:no.therapy   1.16666667  0.526738939  1.8065944
## placebo:CBT-placebo:no.therapy           0.30000000 -0.339927728  0.9399277
## anxifree:CBT-placebo:no.therapy          0.73333333  0.093405606  1.3732611
## joyzepam:CBT-placebo:no.therapy          1.20000000  0.560072272  1.8399277
## joyzepam:no.therapy-anxifree:no.therapy  1.06666667  0.426738939  1.7065944
## placebo:CBT-anxifree:no.therapy          0.20000000 -0.439927728  0.8399277
## anxifree:CBT-anxifree:no.therapy         0.63333333 -0.006594394  1.2732611
## joyzepam:CBT-anxifree:no.therapy         1.10000000  0.460072272  1.7399277
## placebo:CBT-joyzepam:no.therapy         -0.86666667 -1.506594394 -0.2267389
## anxifree:CBT-joyzepam:no.therapy        -0.43333333 -1.073261061  0.2065944
## joyzepam:CBT-joyzepam:no.therapy         0.03333333 -0.606594394  0.6732611
## anxifree:CBT-placebo:CBT                 0.43333333 -0.206594394  1.0732611
## joyzepam:CBT-placebo:CBT                 0.90000000  0.260072272  1.5399277
## joyzepam:CBT-anxifree:CBT                0.46666667 -0.173261061  1.1065944
##                                             p adj
## anxifree:no.therapy-placebo:no.therapy  0.9940083
## joyzepam:no.therapy-placebo:no.therapy  0.0005667
## placebo:CBT-placebo:no.therapy          0.6280049
## anxifree:CBT-placebo:no.therapy         0.0218746
## joyzepam:CBT-placebo:no.therapy         0.0004380
## joyzepam:no.therapy-anxifree:no.therapy 0.0012553
## placebo:CBT-anxifree:no.therapy         0.8917157
## anxifree:CBT-anxifree:no.therapy        0.0529812
## joyzepam:CBT-anxifree:no.therapy        0.0009595
## placebo:CBT-joyzepam:no.therapy         0.0067639
## anxifree:CBT-joyzepam:no.therapy        0.2750590
## joyzepam:CBT-joyzepam:no.therapy        0.9999703
## anxifree:CBT-placebo:CBT                0.2750590
## joyzepam:CBT-placebo:CBT                0.0050693
## joyzepam:CBT-anxifree:CBT               0.2139229

5.7.2 Factorial ANOVA 2: unbalanced designs

# load data
load("input/coffee.Rdata")

# calculating means between different groups
aggregate(babble ~ milk + sugar, data = coffee, FUN = mean)
##   milk sugar babble
## 1  yes  none  3.700
## 2   no  none  5.550
## 3  yes  fake  5.800
## 4   no  fake  4.650
## 5  yes  real  5.100
## 6   no  real  5.875
# calculating sd between different groups
aggregate(babble ~ milk + sugar, data = coffee, FUN = sd)
##   milk sugar    babble
## 1  yes  none 0.2000000
## 2   no  none 0.3535534
## 3  yes  fake 0.1414214
## 4   no  fake 0.7141428
## 5  yes  real 0.5000000
## 6   no  real 0.5500000
# checking how many observations in each group
xtabs(~ milk + sugar, data = coffee)
##      sugar
## milk  none fake real
##   yes    3    2    3
##   no     2    4    4
# running Type I sum of squares
mod.type1 <- lm(babble ~ sugar + milk + sugar:milk, data = coffee)
anova(mod.type1)
## Analysis of Variance Table
## 
## Response: babble
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## sugar       2 3.5575 1.77876  6.7495 0.010863 * 
## milk        1 0.9561 0.95611  3.6279 0.081061 . 
## sugar:milk  2 5.9439 2.97193 11.2769 0.001754 **
## Residuals  12 3.1625 0.26354                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# or
mod.1 <- lm(babble ~ 1, data = coffee)
mod.2 <- lm(babble ~ sugar, data = coffee)
mod.3 <- lm(babble ~ sugar + milk, data = coffee)
mod.4 <- lm(babble ~ sugar + milk + sugar:milk, data = coffee)
anova(mod.1, mod.2, mod.3, mod.4)
## Analysis of Variance Table
## 
## Model 1: babble ~ 1
## Model 2: babble ~ sugar
## Model 3: babble ~ sugar + milk
## Model 4: babble ~ sugar + milk + sugar:milk
##   Res.Df     RSS Df Sum of Sq       F   Pr(>F)   
## 1     17 13.6200                                 
## 2     15 10.0625  2    3.5575  6.7495 0.010863 * 
## 3     14  9.1064  1    0.9561  3.6279 0.081061 . 
## 4     12  3.1625  2    5.9439 11.2769 0.001754 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the order matters
mod_order <- lm(babble ~ milk + sugar + sugar:milk, data = coffee)
anova(mod_order)
## Analysis of Variance Table
## 
## Response: babble
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## milk        1 1.4440 1.44400  5.4792 0.037333 * 
## sugar       2 3.0696 1.53482  5.8238 0.017075 * 
## milk:sugar  2 5.9439 2.97193 11.2769 0.001754 **
## Residuals  12 3.1625 0.26354                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# running Type III sum of squares (default contrasts = treatment contrasts)
mod.type3 <- lm(babble ~ sugar * milk, data = coffee)
car::Anova(mod.type3, type = 3)
## Anova Table (Type III tests)
## 
## Response: babble
##             Sum Sq Df F value       Pr(>F)    
## (Intercept) 41.070  1 155.839 0.0000000311 ***
## sugar        5.880  2  11.156     0.001830 ** 
## milk         4.107  1  15.584     0.001936 ** 
## sugar:milk   5.944  2  11.277     0.001754 ** 
## Residuals    3.162 12                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same Type III analysis but with Helmert contrasts for both factors.
# The main-effect tests change substantially (the interaction row does
# not), illustrating that Type III results depend on the contrast
# coding used to fit the model.
my.contrasts.type3 <- list(milk = "contr.Helmert", sugar = "contr.Helmert")
mod.H <- lm(babble ~ sugar * milk, data = coffee, contrasts = my.contrasts.type3)
car::Anova(mod.H, type = 3)
## Anova Table (Type III tests)
## 
## Response: babble
##             Sum Sq Df   F value              Pr(>F)    
## (Intercept) 434.29  1 1647.8882 0.00000000000003231 ***
## sugar         2.13  2    4.0446            0.045426 *  
## milk          1.00  1    3.8102            0.074672 .  
## sugar:milk    5.94  2   11.2769            0.001754 ** 
## Residuals     3.16 12                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Type II sums of squares: each main effect is tested after the other
# main effect but ignoring the interaction. Unlike Type III, the result
# does not depend on the contrast coding.
mod.type2 <- lm(babble ~ sugar * milk, data = coffee)
car::Anova(mod.type2, type = 2)
## Anova Table (Type II tests)
## 
## Response: babble
##            Sum Sq Df F value   Pr(>F)   
## sugar      3.0696  2  5.8238 0.017075 * 
## milk       0.9561  1  3.6279 0.081061 . 
## sugar:milk 5.9439  2 11.2769 0.001754 **
## Residuals  3.1625 12                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Effect sizes: eta-squared and partial eta-squared for each term,
# computed with Type II sums of squares (suitable for unbalanced
# designs); anova = TRUE appends the full ANOVA table to the output.
# NOTE(review): mod.type1 is not defined in this section (only
# mod.type2/mod.type3 are) -- presumably it was fitted earlier in the
# document; verify it is lm(babble ~ sugar * milk), which matches the
# SS values printed below.
es <- lsr::etaSquared(mod.type1, type = 2, anova = TRUE)
es
##                eta.sq eta.sq.part        SS df        MS         F           p
## sugar      0.22537682   0.4925493 3.0696323  2 1.5348161  5.823808 0.017075099
## milk       0.07019886   0.2321436 0.9561085  1 0.9561085  3.627921 0.081060698
## sugar:milk 0.43640732   0.6527155 5.9438677  2 2.9719339 11.276903 0.001754333
## Residuals  0.23219530          NA 3.1625000 12 0.2635417        NA          NA

5.8 Bayesian statistics

5.8.1 Bayesian analysis of contingency tables

# Load the chapek9 data set (variables: species, choice) from the
# input folder into the workspace.
load("input/chapek9.Rdata")

# Cross-tabulate species (robot/human) against choice (puppy/flower/
# data); xtabs() builds the contingency table from a formula.
crosstab <- xtabs(~ species + choice, data = chapek9)
crosstab
##        choice
## species puppy flower data
##   robot    13     30   44
##   human    15     13   65
# Frequentist baseline: a chi-square test of association between
# species and choice (lsr's wrapper also prints observed/expected
# tables and Cramer's v as the effect size).
lsr::associationTest(~ species + choice, data = chapek9)
## 
##      Chi-square test of categorical association
## 
## Variables:   species, choice 
## 
## Hypotheses: 
##    null:        variables are independent of one another
##    alternative: some contingency exists between variables
## 
## Observed contingency table:
##        choice
## species puppy flower data
##   robot    13     30   44
##   human    15     13   65
## 
## Expected contingency table under the null hypothesis:
##        choice
## species puppy flower data
##   robot  13.5   20.8 52.7
##   human  14.5   22.2 56.3
## 
## Test results: 
##    X-squared statistic:  10.722 
##    degrees of freedom:  2 
##    p-value:  0.005 
## 
## Other information: 
##    estimated effect size (Cramer's v):  0.244
# Bayesian analysis of the same contingency table. The Bayes factor
# depends on the assumed sampling plan, so three plans are compared:
# joint multinomial (only the grand total fixed), Poisson (nothing
# fixed), and independent multinomial (row totals fixed).
BayesFactor::contingencyTableBF(crosstab, sampleType = "jointMulti")
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 15.92684 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, joint multinomial
# Poisson sampling plan: every cell count is a free Poisson variate.
BayesFactor::contingencyTableBF(crosstab, sampleType = "poisson")
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 28.20757 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, poisson
# Independent multinomial plan: the row margins (species counts) are
# treated as fixed by design.
BayesFactor::contingencyTableBF(crosstab, sampleType = "indepMulti", fixedMargin = "rows")
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 8.605897 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, independent multinomial
# Hypergeometric sampling plan (both margins fixed) is only available
# for 2x2 tables, so it cannot be applied to this 2x3 table:
# BayesFactor::contingencyTableBF(crosstab, sampleType = "hypergeom")

5.8.2 Bayesian t-tests

5.8.2.1 Independent samples t-test

# Frequentist baseline: Student's independent samples t-test comparing
# grades between the two tutors in the harpo data (equal variances
# assumed via var.equal = TRUE).
lsr::independentSamplesTTest(formula = grade ~ tutor, data = harpo, var.equal = TRUE)
## 
##    Student's independent samples t-test 
## 
## Outcome variable:   grade 
## Grouping variable:  tutor 
## 
## Descriptive statistics: 
##             Anastasia Bernadette
##    mean        74.533     69.056
##    std dev.     8.999      5.775
## 
## Hypotheses: 
##    null:        population means equal for both groups
##    alternative: different population means in each group
## 
## Test results: 
##    t-statistic:  2.115 
##    degrees of freedom:  31 
##    p-value:  0.043 
## 
## Other information: 
##    two-sided 95% confidence interval:  [0.197, 10.759] 
##    estimated effect size (Cohen's d):  0.74
# The Bayesian counterpart: a JZS Bayes factor for the same two-group
# comparison (default Cauchy prior scale r = 0.707). BF ~ 1.75 is only
# weak evidence for a group difference despite p < .05 above.
BayesFactor::ttestBF(formula = grade ~ tutor, data = harpo)
## Bayes factor analysis
## --------------
## [1] Alt., r=0.707 : 1.754927 ±0.01%
## 
## Against denominator:
##   Null, mu1-mu2 = 0 
## ---
## Bayes factor type: BFindepSample, JZS

5.8.2.2 Paired samples t-test

# Bayesian paired samples t-test on the chico data: compare each
# student's grade on test 1 vs. test 2 (paired = TRUE reduces this to
# a one-sample test on the differences, hence "BFoneSample" below).
BayesFactor::ttestBF(x = chico$grade_test1, y = chico$grade_test2, paired = TRUE)
## Bayes factor analysis
## --------------
## [1] Alt., r=0.707 : 5992.05 ±0%
## 
## Against denominator:
##   Null, mu = 0 
## ---
## Bayes factor type: BFoneSample, JZS

5.8.3 Bayesian regression

# Bayesian regression on the parenthood data: regressionBF() fits every
# subset of the listed predictors and reports each model's Bayes factor
# against the intercept-only model.
BayesFactor::regressionBF(formula = dan.grump ~ dan.sleep + day + baby.sleep, data = parenthood)
## Bayes factor analysis
## --------------
## [1] dan.sleep                    : 16225449751311842697519497320857600 ±0.01%
## [2] day                          : 0.2724027                           ±0%
## [3] baby.sleep                   : 10018411                            ±0%
## [4] dan.sleep + day              : 1016576306925936453003653087756288  ±0%
## [5] dan.sleep + baby.sleep       : 977021991542030776556636533161984   ±0%
## [6] day + baby.sleep             : 2340755                             ±0%
## [7] dan.sleep + day + baby.sleep : 78356247747947433341389794967552    ±0%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# Store the analysis and show only the three best-supported models.
models <- BayesFactor::regressionBF(formula = dan.grump ~ dan.sleep + day + baby.sleep, data = parenthood)
head(models, n = 3)
## Bayes factor analysis
## --------------
## [1] dan.sleep              : 16225449751311842697519497320857600 ±0.01%
## [2] dan.sleep + day        : 1016576306925936453003653087756288  ±0%
## [3] dan.sleep + baby.sleep : 977021991542030776556636533161984   ±0%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# Re-express every Bayes factor relative to the best model: dividing by
# max(models) makes the winner's BF equal to 1.
head(models / max(models), n = 3)
## Bayes factor analysis
## --------------
## [1] dan.sleep              : 1         ±0%
## [2] dan.sleep + day        : 0.0626532 ±0.01%
## [3] dan.sleep + baby.sleep : 0.0602154 ±0.01%
## 
## Against denominator:
##   dan.grump ~ dan.sleep 
## ---
## Bayes factor type: BFlinearModel, JZS
# Direct comparison of two specific models by indexing the BF object
# (rows 1 and 4 of the original 7-model output above): the dan.sleep
# model is ~16 times better supported than dan.sleep + day.
models[1] / models[4]
## Bayes factor analysis
## --------------
## [1] dan.sleep : 15.96088 ±0.01%
## 
## Against denominator:
##   dan.grump ~ dan.sleep + day 
## ---
## Bayes factor type: BFlinearModel, JZS
# Top-down analysis: start from the full specified model and report the
# Bayes factor for omitting each predictor in turn.
BayesFactor::regressionBF(formula = dan.grump ~ dan.sleep + baby.sleep, data = parenthood, whichModels = "top")
## Bayes factor top-down analysis
## --------------
## When effect is omitted from dan.sleep + baby.sleep , BF is...
## [1] Omit baby.sleep : 16.60705                           ±0.01%
## [2] Omit dan.sleep  : 0.00000000000000000000000001025403 ±0.01%
## 
## Against denominator:
##   dan.grump ~ dan.sleep + baby.sleep 
## ---
## Bayes factor type: BFlinearModel, JZS

5.8.4 Bayesian ANOVA

# Bayesian ANOVA on the clinical-trial data: anovaBF() fits all models
# consistent with the drug * therapy specification and reports each
# Bayes factor against the intercept-only model.
# Fix: use the explicit BayesFactor:: namespace prefix, consistent with
# every other BayesFactor call in this document (contingencyTableBF,
# ttestBF, regressionBF).
modelsANOVA <- BayesFactor::anovaBF(formula = mood.gain ~ drug * therapy, data = clin.trial)
modelsANOVA
## Bayes factor analysis
## --------------
## [1] drug                          : 245.9026  ±0%
## [2] therapy                       : 0.7316007 ±0%
## [3] drug + therapy                : 707.9575  ±2.64%
## [4] drug + therapy + drug:therapy : 695.9421  ±2.55%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# Rescale so the best model (drug + therapy) has BF = 1; all other
# models are then expressed relative to it.
modelsANOVA / max(modelsANOVA)
## Bayes factor analysis
## --------------
## [1] drug                          : 0.3473409   ±2.64%
## [2] therapy                       : 0.001033396 ±2.64%
## [3] drug + therapy                : 1           ±0%
## [4] drug + therapy + drug:therapy : 0.9830281   ±3.67%
## 
## Against denominator:
##   mood.gain ~ drug + therapy 
## ---
## Bayes factor type: BFlinearModel, JZS
# Invert the comparison (best model as numerator) to obtain Bayes
# factors analogous to a Type II ANOVA table.
max(modelsANOVA) / modelsANOVA
##                 denominator
## numerator            drug  therapy drug + therapy drug + therapy + drug:therapy
##   drug + therapy 2.879016 967.6829              1                      1.017265

6 Linting

The code in this RMarkdown is linted with the lintr package, whose default linters follow the tidyverse style guide.

# Lint this RMarkdown with lintr's default linters, disabling the
# commented-code and trailing-whitespace checks (both are triggered by
# the transcribed console output embedded in this report).
# NOTE(review): lintr::with_defaults() was deprecated in lintr 3.0 in
# favour of lintr::linters_with_defaults() -- update if lintr is bumped.
# lintr::lint("main.Rmd", linters =
#               lintr::with_defaults(
#                 commented_code_linter = NULL,
#                 trailing_whitespace_linter = NULL
#                 )
#             )
# if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")